Impulse2000 committed on
Commit
21035f8
·
verified ·
1 Parent(s): 915c41c

Upload sentiment-transformer model

Browse files
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: text-classification
3
+ tags:
4
+ - sentiment-analysis
5
+ - transformer
6
+ - custom
7
+ - pytorch
8
+ - trained-from-scratch
9
+ datasets:
10
+ - stanfordnlp/imdb
11
+ - stanfordnlp/sentiment140
12
+ - SetFit/sst5
13
+ - financial_phrasebank
14
+ - tweet_eval
15
+ language:
16
+ - en
17
+ license: mit
18
+ ---
19
+
20
+ # Sentiment Transformer — tango
21
+
22
+ A small (≈13M parameter) transformer encoder trained **entirely from scratch** for
23
+ 3-class sentiment analysis (negative / neutral / positive).
24
+
25
+ ## Architecture
26
+
27
+ Pre-layer-norm transformer encoder with RoPE, SwiGLU feed-forward layers, masked mean pooling and a 2-layer MLP classification head.
28
+ Built with pure `torch.nn` — no pretrained weights.
29
+
30
+ | Parameter | Value |
31
+ |---|---|
32
+ | Hidden dim | 256 |
33
+ | FFN dim | 1024 |
34
+ | Layers | 6 |
35
+ | Heads | 8 |
36
+ | Max seq len | 256 |
37
+ | Vocab size | 16000 |
38
+ | Labels | NEGATIVE, NEUTRAL, POSITIVE |
39
+ | Precision | bf16 mixed-precision |
40
+
41
+ ## Training Data
42
+
43
+ Trained on a combined corpus of:
44
+ - **IMDB** (50k movie reviews)
45
+ - **Sentiment140** (1M tweets)
46
+ - **Yelp** (1M reviews)
47
+ - **SST-5** (fine-grained → 3-class)
48
+ - **Financial PhraseBank** (finance headlines)
49
+ - **TweetEval** (SemEval-2017 tweets)
50
+
51
+ ## Usage
52
+
53
+ ```python
54
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
55
+
56
+ model = AutoModelForSequenceClassification.from_pretrained(
57
+ "Impulse2000/sentiment-transformer", trust_remote_code=True
58
+ )
59
+ tokenizer = AutoTokenizer.from_pretrained("Impulse2000/sentiment-transformer")
60
+ pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
61
+ print(pipe("This movie was absolutely fantastic!"))
62
+ ```
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SentimentTransformerForSequenceClassification"
4
+ ],
5
+ "dtype": "float32",
6
+ "hidden_dropout_prob": 0.1,
7
+ "hidden_size": 256,
8
+ "id2label": {
9
+ "0": "NEGATIVE",
10
+ "1": "NEUTRAL",
11
+ "2": "POSITIVE"
12
+ },
13
+ "intermediate_size": 1024,
14
+ "label2id": {
15
+ "NEGATIVE": 0,
16
+ "NEUTRAL": 1,
17
+ "POSITIVE": 2
18
+ },
19
+ "max_position_embeddings": 256,
20
+ "model_type": "sentiment-transformer",
21
+ "num_attention_heads": 8,
22
+ "num_hidden_layers": 6,
23
+ "pad_token_id": 0,
24
+ "problem_type": "single_label_classification",
25
+ "transformers_version": "5.5.0",
26
+ "vocab_size": 16000,
27
+ "auto_map": {
28
+ "AutoConfig": "configuration_sentiment_transformer.SentimentTransformerConfig",
29
+ "AutoModelForSequenceClassification": "modeling_sentiment_transformer.SentimentTransformerForSequenceClassification"
30
+ }
31
+ }
configuration_sentiment_transformer.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face configuration for the Sentiment Transformer.
3
+
4
+ This file is **self-contained** — it has no dependency on the project's
5
+ ``config.py`` or ``config.toml``. It is copied verbatim into every HF
6
+ export directory so that ``AutoConfig.from_pretrained()`` works with
7
+ ``trust_remote_code=True``.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from transformers import PretrainedConfig
13
+
14
+
15
class SentimentTransformerConfig(PretrainedConfig):
    """Configuration object for the custom sentiment transformer classifier.

    Bridges the project's internal hyperparameter names to the canonical
    field names that ``AutoConfig`` / ``AutoModel`` expect.

    Attributes
    ----------
    vocab_size : int
        Number of entries in the BPE vocabulary.
    hidden_size : int
        Embedding / hidden width of the transformer.
    intermediate_size : int
        Expanded inner width of the position-wise FFN.
    num_hidden_layers : int
        How many transformer encoder blocks are stacked.
    num_attention_heads : int
        Number of parallel attention heads per block.
    max_position_embeddings : int
        Longest supported input sequence.
    hidden_dropout_prob : float
        Dropout probability applied throughout the model.
    num_labels : int
        Output class count (2 for binary, 3 for ternary, etc.).
    """

    model_type = "sentiment-transformer"

    def __init__(
        self,
        vocab_size: int = 16_000,
        hidden_size: int = 256,
        intermediate_size: int = 1024,
        num_hidden_layers: int = 6,
        num_attention_heads: int = 8,
        max_position_embeddings: int = 256,
        hidden_dropout_prob: float = 0.1,
        num_labels: int = 2,
        pad_token_id: int = 0,
        id2label: dict[int, str] | None = None,
        label2id: dict[str, int] | None = None,
        **kwargs,
    ) -> None:
        # A serialized config.json may carry both `id2label` and the hidden
        # HF default ``num_labels = 2``; when they disagree, the label map
        # is authoritative, so derive the count from it.
        if id2label is not None and num_labels != len(id2label):
            num_labels = len(id2label)

        # `problem_type` may already come from a saved config.json —
        # setdefault avoids passing the same kwarg twice.
        kwargs.setdefault("problem_type", "single_label_classification")

        super().__init__(
            pad_token_id=pad_token_id,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.hidden_dropout_prob = hidden_dropout_prob
+ self.hidden_dropout_prob = hidden_dropout_prob
example.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Example usage of the Sentiment Transformer with HuggingFace Transformers.
3
+
4
+ This file is included in every HF export directory as a quick-start reference.
5
+
6
+ Usage::
7
+
8
+ python example.py
9
+ python example.py --text "This movie was incredible!"
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import sys
16
+ from pathlib import Path
17
+
18
+
19
def main() -> None:
    """Parse CLI options, load the exported model, and print predictions."""
    parser = argparse.ArgumentParser(
        description="Quick-start example for the Sentiment Transformer.",
    )
    parser.add_argument(
        "--text",
        type=str,
        default=None,
        help="Single text to classify. If omitted, runs built-in examples.",
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        default=str(Path(__file__).resolve().parent),
        help="Path to the HF model directory. Defaults to this file's directory.",
    )
    opts = parser.parse_args()

    # `transformers` is a hard requirement; fail with install instructions.
    try:
        from transformers import (
            AutoModelForSequenceClassification,
            AutoTokenizer,
            pipeline,
        )
    except ImportError:
        print("ERROR: `transformers` is required. Install with:")
        print(" pip install transformers torch")
        sys.exit(1)

    print(f"Loading model from: {opts.model_dir}")
    model = AutoModelForSequenceClassification.from_pretrained(
        opts.model_dir, trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(opts.model_dir)
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

    print(f"Model: {type(model).__name__}")
    print(f"Labels: {model.config.id2label}")
    print()

    # A single user-supplied text, or the built-in demo sentences.
    if opts.text:
        texts = [opts.text]
    else:
        texts = [
            "This movie was absolutely fantastic! I loved every minute of it.",
            "Terrible film, completely unwatchable garbage.",
            "The movie was okay, nothing special really.",
            "An outstanding performance by the entire cast.",
            "I fell asleep halfway through. Waste of time.",
        ]

    for text, pred in zip(texts, pipe(texts)):
        label, score = pred["label"], pred["score"]
        print(f" {label:8s} ({score:.4f}) {text}")

    # Show the full label distribution for the first example.
    print("\n--- Top-k prediction ---")
    sample = texts[0]
    print(f" \"{sample[:60]}...\"")
    for entry in pipe(sample, top_k=None):
        bar = "█" * int(entry["score"] * 40)
        print(f" {entry['label']:8s} {entry['score']:.4f} {bar}")


if __name__ == "__main__":
    main()
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f1f663368d88b5e829a71ce47cd55fefd8ae32fb52fc7b328cf58b2e86ca838
3
+ size 35684012
modeling_sentiment_transformer.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face model definition for the Sentiment Transformer.
3
+
4
+ This file is **self-contained** — it depends only on ``torch`` and
5
+ ``transformers``. It is copied verbatim into every HF export directory
6
+ so that ``AutoModelForSequenceClassification.from_pretrained()`` works
7
+ with ``trust_remote_code=True``.
8
+
9
+ Architecture
10
+ ------------
11
+ Token Embedding + RoPE (Rotary Positional Embedding)
12
+ -> N x TransformerEncoderBlock (pre-layer-norm, SwiGLU FFN)
13
+ -> Final LayerNorm
14
+ -> Mean pooling (masked)
15
+ -> 2-layer MLP classification head (num_labels-class logits)
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+
24
+ from transformers import PreTrainedModel
25
+ from transformers.modeling_outputs import SequenceClassifierOutput
26
+
27
+ from configuration_sentiment_transformer import SentimentTransformerConfig
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Rotary Positional Embedding (RoPE)
32
+ # ---------------------------------------------------------------------------
33
+
34
class RotaryEmbedding(nn.Module):
    """Cached sin/cos tables for rotary positional embeddings (RoPE).

    RoPE injects absolute position by *rotating* consecutive dimension
    pairs of Q and K, which makes the Q·K dot product depend naturally
    on the relative offset (i - j) while adding no learnable parameters.
    """

    def __init__(self, head_dim: int, max_seq_len: int, base: float = 10000.0) -> None:
        super().__init__()
        assert head_dim % 2 == 0, "head_dim must be even for RoPE"

        # One inverse frequency per dimension pair.
        inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        positions = torch.arange(max_seq_len).float()
        angles = torch.outer(positions, inv_freq)  # (max_seq_len, head_dim // 2)
        self.register_buffer("cos_cached", angles.cos(), persistent=False)
        self.register_buffer("sin_cached", angles.sin(), persistent=False)

    def forward(self, seq_len: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return cached ``(cos, sin)``, each of shape ``(seq_len, head_dim // 2)``."""
        return self.cos_cached[:seq_len], self.sin_cached[:seq_len]
58
+
59
+
60
+ def _apply_rope(
61
+ x: torch.Tensor,
62
+ cos: torch.Tensor,
63
+ sin: torch.Tensor,
64
+ ) -> torch.Tensor:
65
+ """Apply rotary embedding to a Q or K tensor.
66
+
67
+ Parameters
68
+ ----------
69
+ x : Tensor, shape ``(B, num_heads, S, head_dim)``
70
+ cos, sin : Tensor, shape ``(S, head_dim // 2)``
71
+
72
+ Returns
73
+ -------
74
+ Tensor, same shape as ``x``.
75
+ """
76
+ x1 = x[..., 0::2] # even indices
77
+ x2 = x[..., 1::2] # odd indices
78
+
79
+ cos = cos.unsqueeze(0).unsqueeze(0)
80
+ sin = sin.unsqueeze(0).unsqueeze(0)
81
+
82
+ out1 = x1 * cos - x2 * sin
83
+ out2 = x1 * sin + x2 * cos
84
+
85
+ return torch.stack((out1, out2), dim=-1).flatten(-2)
86
+
87
+
88
+ # ---------------------------------------------------------------------------
89
+ # Building blocks
90
+ # ---------------------------------------------------------------------------
91
+
92
class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention with RoPE, backed by the fused SDPA kernel.

    ``F.scaled_dot_product_attention`` dispatches to FlashAttention or
    memory-efficient attention automatically on compatible GPUs.
    """

    def __init__(
        self,
        hidden_dim: int,
        num_heads: int,
        dropout: float,
        rope: RotaryEmbedding,
    ) -> None:
        super().__init__()
        assert hidden_dim % num_heads == 0, (
            f"hidden_dim ({hidden_dim}) must be divisible by num_heads ({num_heads})"
        )
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads
        self.dropout = dropout
        self.rope = rope

        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
        self.k_proj = nn.Linear(hidden_dim, hidden_dim)
        self.v_proj = nn.Linear(hidden_dim, hidden_dim)
        self.out_proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        batch, seq, width = x.shape

        def to_heads(t: torch.Tensor) -> torch.Tensor:
            # (B, S, H) -> (B, num_heads, S, head_dim)
            return t.view(batch, seq, self.num_heads, self.head_dim).transpose(1, 2)

        q = to_heads(self.q_proj(x))
        k = to_heads(self.k_proj(x))
        v = to_heads(self.v_proj(x))

        # Inject positional information by rotating Q and K.
        cos, sin = self.rope(seq)
        q = _apply_rope(q, cos, sin)
        k = _apply_rope(k, cos, sin)

        # Boolean mask broadcast to (B, 1, 1, S); True = key may be attended.
        sdpa_mask = None
        if attention_mask is not None:
            sdpa_mask = attention_mask.bool()[:, None, None, :]

        ctx = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=sdpa_mask,
            dropout_p=self.dropout if self.training else 0.0,
        )

        # Merge heads back into the hidden dimension and project out.
        ctx = ctx.transpose(1, 2).contiguous().view(batch, seq, width)
        return self.out_proj(ctx)
148
+
149
+
150
class SwiGLUFeedForward(nn.Module):
    """SwiGLU position-wise feed-forward network (LLaMA / Gemma style).

    SwiGLU(x) = W_down · (SiLU(W_gate · x) ⊙ W_up · x)

    The inner width is scaled by 2/3 so the parameter count roughly
    matches a plain 2-layer FFN of size ``ffn_dim``.
    """

    def __init__(self, hidden_dim: int, ffn_dim: int, dropout: float) -> None:
        super().__init__()
        raw = int(2 / 3 * ffn_dim)
        inner_dim = -(-raw // 8) * 8  # ceil to a multiple of 8 for tensor cores

        self.w_gate = nn.Linear(hidden_dim, inner_dim, bias=False)
        self.w_up = nn.Linear(hidden_dim, inner_dim, bias=False)
        self.w_down = nn.Linear(inner_dim, hidden_dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gated = F.silu(self.w_gate(x)) * self.w_up(x)
        return self.dropout(self.w_down(gated))
168
+
169
+
170
class TransformerEncoderBlock(nn.Module):
    """One pre-layer-norm encoder block with SwiGLU FFN.

    Pre-LN normalizes *before* each sub-layer, keeping the residual
    stream unnormalized:

        x = x + Attention(LayerNorm(x))
        x = x + SwiGLU_FFN(LayerNorm(x))
    """

    def __init__(
        self,
        hidden_dim: int,
        num_heads: int,
        ffn_dim: int,
        dropout: float,
        rope: RotaryEmbedding,
    ) -> None:
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.attn = MultiHeadSelfAttention(hidden_dim, num_heads, dropout, rope)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.ffn = SwiGLUFeedForward(hidden_dim, ffn_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Attention sub-layer with residual connection.
        attn_out = self.attn(self.norm1(x), attention_mask)
        x = x + self.dropout(attn_out)
        # Feed-forward sub-layer with residual connection.
        ffn_out = self.ffn(self.norm2(x))
        return x + self.dropout(ffn_out)
202
+
203
+
204
class SentimentTransformerBackbone(nn.Module):
    """Plain ``nn.Module`` transformer encoder for sentiment classification.

    Positional information comes exclusively from a RoPE module shared by
    all attention layers. The sequence representation is the mean of the
    non-padding token states, fed through a 2-layer MLP head. Returns raw
    logits (no softmax).
    """

    def __init__(
        self,
        vocab_size: int,
        hidden_dim: int,
        ffn_dim: int,
        num_layers: int,
        num_heads: int,
        max_seq_len: int,
        num_classes: int,
        dropout: float = 0.1,
    ) -> None:
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(dropout)

        # One RoPE sin/cos cache shared across every block.
        self.rope = RotaryEmbedding(hidden_dim // num_heads, max_seq_len)

        blocks = [
            TransformerEncoderBlock(
                hidden_dim=hidden_dim,
                num_heads=num_heads,
                ffn_dim=ffn_dim,
                dropout=dropout,
                rope=self.rope,
            )
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(blocks)

        self.final_norm = nn.LayerNorm(hidden_dim)

        # 2-layer MLP head producing class logits.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )
        self._init_weights()

    def _init_weights(self) -> None:
        """Xavier-uniform linears, N(0, 0.02) embeddings, unit LayerNorms."""
        for mod in self.modules():
            if isinstance(mod, nn.Linear):
                nn.init.xavier_uniform_(mod.weight)
                if mod.bias is not None:
                    nn.init.zeros_(mod.bias)
            elif isinstance(mod, nn.Embedding):
                nn.init.normal_(mod.weight, mean=0.0, std=0.02)
                # Padding rows must stay zero so padding contributes nothing.
                if mod.padding_idx is not None:
                    with torch.no_grad():
                        mod.weight[mod.padding_idx].fill_(0)
            elif isinstance(mod, nn.LayerNorm):
                nn.init.ones_(mod.weight)
                nn.init.zeros_(mod.bias)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        # Token embeddings only — position is injected via RoPE in attention.
        hidden = self.embedding_dropout(self.token_embedding(input_ids))

        for block in self.layers:
            hidden = block(hidden, attention_mask)
        hidden = self.final_norm(hidden)

        # Masked mean pooling over real (non-padding) tokens.
        mask = attention_mask.unsqueeze(-1).float()  # (B, S, 1)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)  # (B, H)

        return self.classifier(pooled)
289
+
290
+
291
+ # ---------------------------------------------------------------------------
292
+ # HuggingFace PreTrainedModel wrapper
293
+ # ---------------------------------------------------------------------------
294
+
295
class SentimentTransformerForSequenceClassification(PreTrainedModel):
    """HuggingFace ``PreTrainedModel`` wrapper around the custom backbone.

    Accepts the standard ``input_ids``, ``attention_mask``, and ``labels``
    arguments and returns a
    :class:`~transformers.modeling_outputs.SequenceClassifierOutput`,
    which makes the model usable with
    ``AutoModelForSequenceClassification`` and the HF pipeline API.

    Usage::

        from transformers import AutoModelForSequenceClassification, pipeline

        model = AutoModelForSequenceClassification.from_pretrained(
            "path/to/export", trust_remote_code=True
        )
        pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
        pipe("This movie was amazing!")
    """

    config_class = SentimentTransformerConfig
    base_model_prefix = "backbone"
    main_input_name = "input_ids"

    def __init__(self, config: SentimentTransformerConfig) -> None:
        super().__init__(config)
        self.backbone = SentimentTransformerBackbone(
            vocab_size=config.vocab_size,
            hidden_dim=config.hidden_size,
            ffn_dim=config.intermediate_size,
            num_layers=config.num_hidden_layers,
            num_heads=config.num_attention_heads,
            max_seq_len=config.max_position_embeddings,
            num_classes=config.num_labels,
            dropout=config.hidden_dropout_prob,
        )
        self.post_init()

    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        return_dict: bool | None = None,
        **_kwargs,
    ) -> SequenceClassifierOutput | tuple[torch.Tensor, ...]:
        """Run sequence classification and return HF-style outputs."""
        if input_ids is None:
            raise ValueError("`input_ids` is required.")

        # Treat every token as real when no mask is supplied.
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        logits = self.backbone(input_ids=input_ids, attention_mask=attention_mask)

        loss = F.cross_entropy(logits, labels) if labels is not None else None

        if return_dict is None:
            return_dict = self.config.return_dict
        if not return_dict:
            # Legacy tuple output: (loss, logits) or (logits,).
            return (loss, logits) if loss is not None else (logits,)

        return SequenceClassifierOutput(loss=loss, logits=logits)
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "max_length": 256,
5
+ "model_max_length": 256,
6
+ "pad_to_multiple_of": null,
7
+ "pad_token": "[PAD]",
8
+ "pad_token_type_id": 0,
9
+ "padding_side": "right",
10
+ "sep_token": "[SEP]",
11
+ "stride": 0,
12
+ "tokenizer_class": "TokenizersBackend",
13
+ "truncation_side": "right",
14
+ "truncation_strategy": "longest_first",
15
+ "unk_token": "[UNK]"
16
+ }