Upload sentiment-transformer model

Browse files

Files changed (8) hide show

README.md +62 -0
config.json +31 -0
configuration_sentiment_transformer.py +82 -0
example.py +87 -0
model.safetensors +3 -0
modeling_sentiment_transformer.py +359 -0
tokenizer.json +0 -0
tokenizer_config.json +16 -0

README.md ADDED Viewed

	@@ -0,0 +1,62 @@

+---
+pipeline_tag: text-classification
+tags:
+  - sentiment-analysis
+  - transformer
+  - custom
+  - pytorch
+  - trained-from-scratch
+datasets:
+  - stanfordnlp/imdb
+  - stanfordnlp/sentiment140
+  - SetFit/sst5
+  - financial_phrasebank
+  - tweet_eval
+language:
+  - en
+license: mit
+---
+# Sentiment Transformer — tango
+A small (≈9M parameter) transformer encoder trained **entirely from scratch** for
+3-class sentiment analysis (negative / neutral / positive).
+## Architecture
+Pre-layer-norm transformer encoder with rotary position embeddings (RoPE), SwiGLU feed-forward layers, masked mean pooling, and a 2-layer MLP classification head.
+Built with pure `torch.nn` — no pretrained weights.
+| Parameter | Value |
+|---|---|
+| Hidden dim | 256 |
+| FFN dim | 1024 |
+| Layers | 6 |
+| Heads | 8 |
+| Max seq len | 256 |
+| Vocab size | 16000 |
+| Labels | NEGATIVE, NEUTRAL, POSITIVE |
+| Precision | bf16 mixed-precision |
+## Training Data
+Trained on a combined corpus of:
+- **IMDB** (50k movie reviews)
+- **Sentiment140** (1M tweets)
+- **Yelp** (1M reviews)
+- **SST-5** (fine-grained → 3-class)
+- **Financial PhraseBank** (finance headlines)
+- **TweetEval** (SemEval-2017 tweets)
+## Usage
+```python
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+model = AutoModelForSequenceClassification.from_pretrained(
+    "Impulse2000/sentiment-transformer", trust_remote_code=True
+)
+tokenizer = AutoTokenizer.from_pretrained("Impulse2000/sentiment-transformer")
+pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
+print(pipe("This movie was absolutely fantastic!"))
+```

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "SentimentTransformerForSequenceClassification"
+  ],
+  "dtype": "float32",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 256,
+  "id2label": {
+    "0": "NEGATIVE",
+    "1": "NEUTRAL",
+    "2": "POSITIVE"
+  },
+  "intermediate_size": 1024,
+  "label2id": {
+    "NEGATIVE": 0,
+    "NEUTRAL": 1,
+    "POSITIVE": 2
+  },
+  "max_position_embeddings": 256,
+  "model_type": "sentiment-transformer",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "transformers_version": "5.5.0",
+  "vocab_size": 16000,
+  "auto_map": {
+    "AutoConfig": "configuration_sentiment_transformer.SentimentTransformerConfig",
+    "AutoModelForSequenceClassification": "modeling_sentiment_transformer.SentimentTransformerForSequenceClassification"
+  }
+}

configuration_sentiment_transformer.py ADDED Viewed

	@@ -0,0 +1,82 @@

+"""
+Hugging Face configuration for the Sentiment Transformer.
+This file is **self-contained** — it has no dependency on the project's
+``config.py`` or ``config.toml``.  It is copied verbatim into every HF
+export directory so that ``AutoConfig.from_pretrained()`` works with
+``trust_remote_code=True``.
+"""
+from __future__ import annotations
+from transformers import PretrainedConfig
class SentimentTransformerConfig(PretrainedConfig):
    """Configuration object for the custom sentiment transformer classifier.

    The project's hyperparameters are exposed under the canonical Hugging
    Face field names so that ``AutoConfig`` / ``AutoModel`` can consume them.

    Attributes
    ----------
    vocab_size : int
        Size of the BPE vocabulary.
    hidden_size : int
        Embedding / hidden dimension of the transformer.
    intermediate_size : int
        Inner (expanded) dimension of the position-wise FFN.
    num_hidden_layers : int
        Number of stacked transformer encoder blocks.
    num_attention_heads : int
        Number of parallel attention heads.
    max_position_embeddings : int
        Maximum supported input sequence length.
    hidden_dropout_prob : float
        Dropout probability used throughout the model.
    num_labels : int
        Number of output classes (2 for binary, 3 for ternary, etc.).
    """

    model_type = "sentiment-transformer"

    def __init__(
        self,
        vocab_size: int = 16_000,
        hidden_size: int = 256,
        intermediate_size: int = 1024,
        num_hidden_layers: int = 6,
        num_attention_heads: int = 8,
        max_position_embeddings: int = 256,
        hidden_dropout_prob: float = 0.1,
        num_labels: int = 2,
        pad_token_id: int = 0,
        id2label: dict[int, str] | None = None,
        label2id: dict[str, int] | None = None,
        **kwargs,
    ) -> None:
        # An explicit label map is authoritative for the class count.
        # ``num_labels`` defaults to 2 in this signature, so a serialized
        # config.json that carries a longer ``id2label`` would otherwise
        # disagree with it; always derive the count from the map.
        if id2label is not None:
            num_labels = len(id2label)

        # A serialized config.json may already carry ``problem_type`` inside
        # kwargs; only supply the default when it is absent so the keyword
        # is never passed twice.
        if "problem_type" not in kwargs:
            kwargs["problem_type"] = "single_label_classification"

        super().__init__(
            pad_token_id=pad_token_id,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id,
            **kwargs,
        )

        # Model-shape hyperparameters.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        # Regularisation.
        self.hidden_dropout_prob = hidden_dropout_prob

example.py ADDED Viewed

	@@ -0,0 +1,87 @@

+"""
+Example usage of the Sentiment Transformer with HuggingFace Transformers.
+This file is included in every HF export directory as a quick-start reference.
+Usage::
+    python example.py
+    python example.py --text "This movie was incredible!"
+"""
+from __future__ import annotations
+import argparse
+import sys
+from pathlib import Path
def main() -> None:
    """Load the exported model and print sentiment predictions.

    Command-line options:

    ``--text``
        A single text to classify.  When omitted, a small set of built-in
        example sentences is classified instead.
    ``--model-dir``
        Directory holding the exported model and tokenizer.  Defaults to the
        directory containing this script.

    Exits with status 1 if ``transformers`` cannot be imported.
    """
    parser = argparse.ArgumentParser(
        description="Quick-start example for the Sentiment Transformer.",
    )
    parser.add_argument(
        "--text",
        type=str,
        default=None,
        help="Single text to classify. If omitted, runs built-in examples.",
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        default=str(Path(__file__).resolve().parent),
        help="Path to the HF model directory. Defaults to this file's directory.",
    )
    args = parser.parse_args()

    # Imported lazily so that a missing dependency yields a readable hint
    # instead of a traceback at module import time.
    try:
        from transformers import (
            AutoModelForSequenceClassification,
            AutoTokenizer,
            pipeline,
        )
    except ImportError:
        print("ERROR: `transformers` is required. Install with:")
        print("  pip install transformers torch")
        sys.exit(1)

    print(f"Loading model from: {args.model_dir}")
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_dir, trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model_dir)
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
    print(f"Model: {type(model).__name__}")
    print(f"Labels: {model.config.id2label}")
    print()

    if args.text:
        texts = [args.text]
    else:
        texts = [
            "This movie was absolutely fantastic! I loved every minute of it.",
            "Terrible film, completely unwatchable garbage.",
            "The movie was okay, nothing special really.",
            "An outstanding performance by the entire cast.",
            "I fell asleep halfway through. Waste of time.",
        ]

    # Truncate to the tokenizer's maximum length: the model only supports a
    # bounded number of positions, and a long ``--text`` would otherwise
    # exceed it.
    results = pipe(texts, truncation=True)
    for text, result in zip(texts, results):
        label = result["label"]
        score = result["score"]
        print(f"  {label:8s} ({score:.4f})  {text}")

    # Top-k example: ``top_k=None`` asks the pipeline for every label's score.
    print("\n--- Top-k prediction ---")
    sample = texts[0]
    top_k = pipe(sample, top_k=None, truncation=True)
    # Only show an ellipsis when the preview was actually shortened.
    preview = sample if len(sample) <= 60 else f"{sample[:60]}..."
    print(f"  \"{preview}\"")
    for r in top_k:
        bar = "█" * int(r["score"] * 40)
        print(f"    {r['label']:8s} {r['score']:.4f}  {bar}")


if __name__ == "__main__":
    main()

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f1f663368d88b5e829a71ce47cd55fefd8ae32fb52fc7b328cf58b2e86ca838
+size 35684012

modeling_sentiment_transformer.py ADDED Viewed

	@@ -0,0 +1,359 @@

+"""
+Hugging Face model definition for the Sentiment Transformer.
+This file is **self-contained** — it depends only on ``torch`` and
+``transformers``.  It is copied verbatim into every HF export directory
+so that ``AutoModelForSequenceClassification.from_pretrained()`` works
+with ``trust_remote_code=True``.
+Architecture
+------------
+Token Embedding + RoPE (Rotary Positional Embedding)
+    -> N x TransformerEncoderBlock (pre-layer-norm, SwiGLU FFN)
+    -> Final LayerNorm
+    -> Mean pooling (masked)
+    -> 2-layer MLP classification head (num_labels-class logits)
+"""
+from __future__ import annotations
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import SequenceClassifierOutput
+from configuration_sentiment_transformer import SentimentTransformerConfig
+# ---------------------------------------------------------------------------
+# Rotary Positional Embedding (RoPE)
+# ---------------------------------------------------------------------------
class RotaryEmbedding(nn.Module):
    """Precompute and cache the sin/cos tables used by RoPE.

    RoPE encodes absolute position through *rotation* applied to pairs of
    dimensions in Q and K.  This gives the dot-product between Q_i and K_j
    a natural dependence on relative position (i - j) without any learnable
    parameters.

    Parameters
    ----------
    head_dim : int
        Per-head feature size; must be even because features are rotated
        in pairs.
    max_seq_len : int
        Number of positions whose angles are precomputed and cached.
    base : float
        Base of the geometric frequency progression.

    Raises
    ------
    ValueError
        If ``head_dim`` is odd.
    """

    def __init__(self, head_dim: int, max_seq_len: int, base: float = 10000.0) -> None:
        super().__init__()
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if head_dim % 2 != 0:
            raise ValueError("head_dim must be even for RoPE")
        inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
        # All buffers are non-persistent: they are derived values and are
        # deliberately kept out of the state dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        cos, sin = self._compute_cos_sin(max_seq_len)
        self.register_buffer("cos_cached", cos, persistent=False)
        self.register_buffer("sin_cached", sin, persistent=False)

    def _compute_cos_sin(self, seq_len: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return float32 (cos, sin) tables for positions ``0 .. seq_len - 1``."""
        positions = torch.arange(seq_len, device=self.inv_freq.device).float()
        freqs = torch.outer(positions, self.inv_freq.float())
        return freqs.cos(), freqs.sin()

    def forward(self, seq_len: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return (cos, sin) each of shape (seq_len, head_dim // 2).

        Requests within the cached range are served by slicing the cache.
        Longer requests are computed on demand rather than returning a
        too-short slice that would fail later when broadcast against Q/K.
        Positions past ``max_seq_len`` use the same formula as the cached
        ones; the cache itself is left untouched.
        """
        if seq_len <= self.cos_cached.shape[0]:
            return self.cos_cached[:seq_len], self.sin_cached[:seq_len]
        cos, sin = self._compute_cos_sin(seq_len)
        # Match the cache dtype in case the module was cast after creation.
        return cos.to(self.cos_cached.dtype), sin.to(self.sin_cached.dtype)
+def _apply_rope(
+    x: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+) -> torch.Tensor:
+    """Apply rotary embedding to a Q or K tensor.
+    Parameters
+    ----------
+    x : Tensor, shape ``(B, num_heads, S, head_dim)``
+    cos, sin : Tensor, shape ``(S, head_dim // 2)``
+    Returns
+    -------
+    Tensor, same shape as ``x``.
+    """
+    x1 = x[..., 0::2]  # even indices
+    x2 = x[..., 1::2]  # odd indices
+    cos = cos.unsqueeze(0).unsqueeze(0)
+    sin = sin.unsqueeze(0).unsqueeze(0)
+    out1 = x1 * cos - x2 * sin
+    out2 = x1 * sin + x2 * cos
+    return torch.stack((out1, out2), dim=-1).flatten(-2)
+# ---------------------------------------------------------------------------
+# Building blocks
+# ---------------------------------------------------------------------------
class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention using RoPE and PyTorch's fused SDPA call.

    Queries and keys are rotated with the shared rotary embedding before
    ``F.scaled_dot_product_attention`` is invoked; kernel selection is left
    to PyTorch.
    """

    def __init__(
        self,
        hidden_dim: int,
        num_heads: int,
        dropout: float,
        rope: RotaryEmbedding,
    ) -> None:
        super().__init__()
        assert hidden_dim % num_heads == 0, (
            f"hidden_dim ({hidden_dim}) must be divisible by num_heads ({num_heads})"
        )
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads
        # Stored as a plain float: it is handed to SDPA as ``dropout_p``.
        self.dropout = dropout
        self.rope = rope
        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
        self.k_proj = nn.Linear(hidden_dim, hidden_dim)
        self.v_proj = nn.Linear(hidden_dim, hidden_dim)
        self.out_proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Attend over ``x`` of shape ``(B, S, H)`` and return the same shape.

        ``attention_mask`` is converted to boolean and broadcast over heads
        and query positions; ``None`` means no masking.
        """
        batch, seq_len, width = x.shape

        def split_heads(t: torch.Tensor) -> torch.Tensor:
            # (B, S, H) -> (B, num_heads, S, head_dim)
            return t.view(batch, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Positional information enters through Q and K only; V is untouched.
        cos, sin = self.rope(seq_len)
        queries = _apply_rope(split_heads(self.q_proj(x)), cos, sin)
        keys = _apply_rope(split_heads(self.k_proj(x)), cos, sin)
        values = split_heads(self.v_proj(x))

        if attention_mask is None:
            key_mask = None
        else:
            # Insert head and query axes after the batch axis.
            key_mask = attention_mask.bool()[:, None, None]

        # Attention dropout is only active in training mode.
        drop_p = self.dropout if self.training else 0.0
        context = F.scaled_dot_product_attention(
            queries, keys, values,
            attn_mask=key_mask,
            dropout_p=drop_p,
        )
        # Merge the heads back: (B, num_heads, S, head_dim) -> (B, S, H)
        merged = context.transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.out_proj(merged)
class SwiGLUFeedForward(nn.Module):
    """SwiGLU feed-forward network (as used in LLaMA / Gemma).

    SwiGLU(x) = W_down · (SiLU(W_gate · x) ⊙ W_up · x)

    The three projections are bias-free.  Their inner width is
    ``int(2 / 3 * ffn_dim)`` rounded up to the next multiple of 8.
    """

    def __init__(self, hidden_dim: int, ffn_dim: int, dropout: float) -> None:
        super().__init__()
        scaled = int(2 / 3 * ffn_dim)
        # Ceiling division by 8, then back to units: next multiple of 8.
        inner_dim = -(-scaled // 8) * 8
        self.w_gate = nn.Linear(hidden_dim, inner_dim, bias=False)
        self.w_up = nn.Linear(hidden_dim, inner_dim, bias=False)
        self.w_down = nn.Linear(inner_dim, hidden_dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the gated projection followed by dropout."""
        gate = F.silu(self.w_gate(x))
        value = self.w_up(x)
        return self.dropout(self.w_down(gate * value))
class TransformerEncoderBlock(nn.Module):
    """One encoder layer: **pre-layer-norm** attention followed by SwiGLU.

    Each sub-layer normalises its input first and adds its (dropped-out)
    output back onto the residual stream:
        x = x + Attention(LayerNorm(x))
        x = x + SwiGLU_FFN(LayerNorm(x))
    """

    def __init__(
        self,
        hidden_dim: int,
        num_heads: int,
        ffn_dim: int,
        dropout: float,
        rope: RotaryEmbedding,
    ) -> None:
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.attn = MultiHeadSelfAttention(hidden_dim, num_heads, dropout, rope)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.ffn = SwiGLUFeedForward(hidden_dim, ffn_dim, dropout)
        # Residual dropout, shared by both sub-layers.
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the attention and feed-forward sub-layers over ``x``."""
        attended = self.attn(self.norm1(x), attention_mask)
        x = x + self.dropout(attended)
        transformed = self.ffn(self.norm2(x))
        return x + self.dropout(transformed)
class SentimentTransformerBackbone(nn.Module):
    """Transformer encoder for sentiment classification.

    Token embeddings pass through a stack of encoder blocks that share one
    rotary-embedding module, then a final LayerNorm.  The sequence is reduced
    by mean pooling over non-padding tokens and fed to a 2-layer MLP head.
    Returns raw logits (no softmax).

    Parameters
    ----------
    vocab_size : int
        Number of rows in the token embedding table.
    hidden_dim : int
        Width of the embeddings and of every encoder block.
    ffn_dim : int
        Feed-forward size passed to each encoder block.
    num_layers : int
        Number of encoder blocks.
    num_heads : int
        Attention heads per block.
    max_seq_len : int
        Number of positions precomputed by the rotary embedding.
    num_classes : int
        Size of the output logit vector.
    dropout : float
        Dropout probability used for embeddings, blocks and the head.
    """

    def __init__(
        self,
        vocab_size: int,
        hidden_dim: int,
        ffn_dim: int,
        num_layers: int,
        num_heads: int,
        max_seq_len: int,
        num_classes: int,
        dropout: float = 0.1,
    ) -> None:
        super().__init__()
        # Index 0 is the padding row; its embedding is kept at zero.
        self.token_embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(dropout)

        # One RoPE module shared by every layer.
        head_dim = hidden_dim // num_heads
        self.rope = RotaryEmbedding(head_dim, max_seq_len)

        self.layers = nn.ModuleList([
            TransformerEncoderBlock(
                hidden_dim=hidden_dim,
                num_heads=num_heads,
                ffn_dim=ffn_dim,
                dropout=dropout,
                rope=self.rope,
            )
            for _ in range(num_layers)
        ])
        self.final_norm = nn.LayerNorm(hidden_dim)

        # 2-layer MLP classification head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )
        self._init_weights()

    def _init_weights(self) -> None:
        """Xavier-uniform for linear layers, normal for embeddings."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
                # The normal init overwrote the padding row; zero it again.
                if module.padding_idx is not None:
                    with torch.no_grad():
                        module.weight[module.padding_idx].fill_(0)
            elif isinstance(module, nn.LayerNorm):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Encode a batch of token ids and return classification logits.

        Parameters
        ----------
        input_ids : Tensor, shape ``(B, S)``
            Token indices.
        attention_mask : Tensor, shape ``(B, S)``
            Non-zero for real tokens, zero for padding.

        Returns
        -------
        Tensor, shape ``(B, num_classes)``
            Unnormalised class scores.
        """
        # Token embeddings only — positional information injected via RoPE
        x = self.embedding_dropout(self.token_embedding(input_ids))
        for layer in self.layers:
            x = layer(x, attention_mask)
        x = self.final_norm(x)

        # Mean pooling over non-padding tokens.  The reduction runs in
        # float32 and is cast back to the activation dtype afterwards, so a
        # model held in fp16/bf16 does not hand a float32 tensor to the
        # half-precision classifier weights.
        mask = attention_mask.unsqueeze(-1).float()  # (B, S, 1)
        summed = (x.float() * mask).sum(dim=1)  # (B, H)
        # Guard against division by zero for rows with no real tokens.
        counts = mask.sum(dim=1).clamp(min=1e-9)  # (B, 1)
        pooled = (summed / counts).to(x.dtype)
        return self.classifier(pooled)
+# ---------------------------------------------------------------------------
+# HuggingFace PreTrainedModel wrapper
+# ---------------------------------------------------------------------------
class SentimentTransformerForSequenceClassification(PreTrainedModel):
    """HuggingFace-compatible sequence classification wrapper.

    This class bridges the custom transformer backbone with the HF
    ecosystem.  It accepts the standard ``input_ids``, ``attention_mask``,
    and ``labels`` arguments and returns a
    :class:`~transformers.modeling_outputs.SequenceClassifierOutput`.

    Usage::

        from transformers import (
            AutoModelForSequenceClassification,
            AutoTokenizer,
            pipeline,
        )

        model = AutoModelForSequenceClassification.from_pretrained(
            "path/to/export", trust_remote_code=True
        )
        tokenizer = AutoTokenizer.from_pretrained("path/to/export")
        pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
        pipe("This movie was amazing!")
    """

    config_class = SentimentTransformerConfig
    base_model_prefix = "backbone"
    main_input_name = "input_ids"

    def __init__(self, config: SentimentTransformerConfig) -> None:
        super().__init__(config)
        # Map the HF config field names onto the backbone's own argument names.
        self.backbone = SentimentTransformerBackbone(
            vocab_size=config.vocab_size,
            hidden_dim=config.hidden_size,
            ffn_dim=config.intermediate_size,
            num_layers=config.num_hidden_layers,
            num_heads=config.num_attention_heads,
            max_seq_len=config.max_position_embeddings,
            num_classes=config.num_labels,
            dropout=config.hidden_dropout_prob,
        )
        self.post_init()

    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        return_dict: bool | None = None,
        **_kwargs,
    ) -> SequenceClassifierOutput | tuple[torch.Tensor, ...]:
        """Run sequence classification and return HF-style outputs.

        Parameters
        ----------
        input_ids : Tensor
            Token indices; required.
        attention_mask : Tensor, optional
            Mask over ``input_ids``.  When omitted, every position is
            treated as a real token.
        labels : Tensor, optional
            Targets for the cross-entropy loss.  Integer class ids may be
            shaped ``(B,)`` or ``(B, 1)``; floating-point targets are passed
            to the loss unchanged.
        return_dict : bool, optional
            Overrides ``config.return_dict`` when given.
        **_kwargs
            Any further keyword arguments are accepted and ignored.

        Returns
        -------
        SequenceClassifierOutput or tuple
            ``(loss, logits)`` / ``(logits,)`` when a tuple is requested.

        Raises
        ------
        ValueError
            If ``input_ids`` is ``None``.
        """
        if input_ids is None:
            raise ValueError("`input_ids` is required.")
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        logits = self.backbone(input_ids=input_ids, attention_mask=attention_mask)

        loss = None
        if labels is not None:
            # Keep targets on the same device as the logits.
            labels = labels.to(logits.device)
            # Integer class ids are flattened so a trailing singleton axis is
            # accepted; floating-point targets are left as they are.
            if not torch.is_floating_point(labels):
                labels = labels.reshape(-1)
            loss = F.cross_entropy(logits, labels)

        use_return_dict = (
            return_dict if return_dict is not None else self.config.return_dict
        )
        if not use_return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output
        return SequenceClassifierOutput(loss=loss, logits=logits)

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "backend": "tokenizers",
+  "cls_token": "[CLS]",
+  "max_length": 256,
+  "model_max_length": 256,
+  "pad_to_multiple_of": null,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "tokenizer_class": "TokenizersBackend",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}