timagonch Claude Sonnet 4.6 committed on
Commit
f11fe9d
·
1 Parent(s): 35dc479

Update to four_class best model: tau=0.15, expanded project overview

- Fix load_model() to use hf_hub_download (was using local paths)
- Update class label: Offensive Language → Obscene Language
- Update temperature: 0.07 → 0.15 in config and inference
- Add easter egg popover with full project history and model stats
- Pass temperature from config to classify_text (was hardcoded 0.1)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (3)
  1. app.py +171 -18
  2. poc/config.yaml +2 -2
  3. poc/src/inference.py +4 -4
app.py CHANGED
@@ -28,15 +28,135 @@ MODEL_REPO = "timagonch/algospeak-classifier-model"
 LOG_REPO = "timagonch/algospeak-logs"
 LOG_DIR = BASE_DIR / "logs"
 LOG_FILE = LOG_DIR / "predictions.csv"
-LOG_COLS = ["text", "predicted_label", "score_allowed", "score_offensive", "score_mature", "score_algospeak", "timestamp"]
+LOG_COLS = ["text", "predicted_label", "score_allowed", "score_obscene", "score_mature", "score_algospeak", "timestamp"]

 CLASS_COLORS = {
     "Allowed": "green",
-    "Offensive Language": "red",
+    "Obscene Language": "red",
     "Mature Content": "orange",
     "Algospeak": "violet",
 }

+ABOUT_MD = """
+## Algospeak Classifier: Project Overview
+
+This tool is the result of a semester-long research project exploring **algospeak detection** as part of a content moderation pipeline for social media. The goal was to classify posts not just by whether they contain harmful content, but by *how* that content is expressed, including coded language specifically designed to evade automated filters.
+
+---
+
+### What is Algospeak?
+
+Algospeak is a form of linguistic camouflage that emerged organically on platforms like TikTok, Bluesky, and Twitter/X. When users learn that certain words trigger automated takedowns, they develop workarounds: substitutions that carry the same meaning but bypass keyword filters:
+
+- **"unalive"** instead of suicide or self-harm
+- **"corn"** for explicit sexual content
+- **"k!ll", "k1ll", "k.i.l.l"** for violence
+- Phonetic swaps (e.g. "seggs"), emoji substitutions, abbreviations, repurposed innocent words
+
+The challenge is that these substitutions evolve constantly, vary by community, and are nearly impossible to keep up with using hand-crafted rules. The only durable solution is a model that understands *intent* from context.
+
+---
+
+### Architecture
+
+The model is a **Dual BERTweet** network: two separate BERTweet encoders (vinai/bertweet-base, ~270M parameters combined) trained jointly with a contrastive learning objective called Supervised InfoNCE:
+
+- **Supervised encoder**: receives label-prefixed text during training (e.g. `"Algospeak: gonna unalive myself"`). Acts as a teacher by injecting class identity directly into the text.
+- **Unsupervised encoder**: receives raw text only, and is trained to match the supervised encoder's embedding space via the InfoNCE loss.
+
+After training, the supervised encoder is discarded entirely. At inference, the unsupervised encoder embeds an incoming post and compares it via cosine similarity against four **class prototypes**, the average embedding per class computed from the training set. The nearest prototype wins. The algospeak prototype uses inverse deny-term frequency weighting so rarer coded forms aren't drowned out by common ones.
+
+This approach was chosen specifically because it requires no rulesets, no exemplar lookup, and no deny list at inference time: just a single forward pass and a dot product.
+
+---
+
+### Data Collection & Manual Reclassification
+
+The dataset was built from Bluesky social media posts collected by the team. Raw posts came in with initial labels, but those labels were noisy, so a careful manual re-review pass was done across the dataset.
+
+To improve consistency on the class 1 / class 2 boundary, **two deny lists** were built:
+- `deny_list_class1.txt`: 115 terms covering slurs and hate speech
+- `deny_list_class2.txt`: 521 terms covering explicit sexual content, drugs, and violence
+
+A reclassification script applied deny-list hit logic: if a post contained a term from a list and had been labeled in the wrong class, it was overridden. This pass changed ~25,000 labels across the dataset, producing a cleaner `reclassified_final.csv` as the new source of truth.
+
+---
+
+### Synthetic Algospeak Generation
+
+Class 3 (Algospeak) was by far the hardest class to collect naturally. Real algospeak examples are sparse and inconsistently labeled. To address this, a **GPT-4-turbo generation pipeline** was built that takes class 1 and 2 posts and transforms them into algospeak equivalents.
+
+The pipeline used a 7-technique taxonomy grounded in documented community behavior: character substitution, phonetic swaps, pictorial (emoji), abbreviation, repurposing of innocent words, paraphrase, and known community-specific terms. Each term was assigned a technique only if there was a documented example in a hints file, preventing the model from hallucinating plausible-but-wrong substitutions. A deny-term inflection detector ensured that forms like "stabbing" (not just "stab") were correctly passed to the generator.
+
+This produced **13,264 algospeak pairs** (original + transformed), with the original post always kept in the same split as its algospeak counterpart to prevent leakage.
+
+---
+
+### Training Progression
+
+The model went through several iterations as the dataset and architecture evolved:
+
+**~10k/class: first dual BERTweet run (Apr 6)**
+The 414-rule exemplar system was abandoned and replaced with the dual BERTweet architecture. The first full run used ~10,000 posts per class from the cleaned dataset, with a simple random split. Result: **test accuracy 79.9%**.
+
+**~13k/class: group-aware split added (Apr 12)**
+The dataset grew to ~13,300 posts per class using the full synthetic pairs. Critically, a **group-aware split** was introduced: original posts and their algospeak counterparts are always assigned to the same split. Without this, the model can train on a post and be evaluated on a near-identical transformed version, inflating results. With it: **test accuracy 85.9%**.
+
+**~13k/class: weighted prototype + fix (Apr 13)**
+The algospeak class prototype was upgraded to use inverse deny-term frequency weighting, giving rarer substitution forms more influence on the prototype center. A data loader fix was also applied. Result: **test accuracy 89.4%**, the best result on the full dataset.
+
+**LLM audit & reclassification (Apr 16)**
+A GPT-4o-mini audit reclassified ~39,000 posts from the existing splits. The LLM had stricter criteria for class 2 (Mature Content), which collapsed many borderline posts into class 0. This reduced class 2 to ~3,300 posts, a sharp drop from 13k, and the new splits had to be rebalanced much smaller. Result: **test accuracy 76.5%**. The bottleneck had shifted to class 2.
+
+**3-class experiment (Apr 16)**
+As a parallel track, classes 1 and 2 were merged into a single "Harmful Content" class, reducing the problem to 3 classes. With fewer boundaries to learn, the model performed strongly: **test accuracy 89.2%, Algospeak F1 = 93.8%**. This confirmed the architecture works well; the difficulty is class 1 vs. class 2 separation.
+
+---
+
+### Four-Class Controlled Experiment (This Model)
+
+With the full dataset constrained by class 2 data scarcity, a focused experiment was run on a cleaner, smaller, more carefully curated subset of ~874 posts per class. The synthetic generation pipeline was rerun with stricter controls, producing 429 new algospeak examples. The two deny lists were merged into a single experiment-local list to avoid cross-contamination between class 1 and class 2 deny terms.
+
+#### Temperature Ablation
+
+Temperature (τ) controls the sharpness of the contrastive loss gradient. Lower τ forces tighter clusters, which risks overfitting on small datasets; higher τ acts as regularization. Four runs were compared:
+
+| Run | τ | Test Acc | Macro F1 | Algospeak F1 | Mean AUC |
+|-----|------|----------|----------|--------------|----------|
+| 1 | 0.10 | 0.7918 | 0.7957 | 0.9032 | 0.9452 |
+| 2 | 0.07 | 0.7214 | 0.7256 | 0.8138 | 0.8979 |
+| **3 ✓** | **0.15** | **0.8065** | **0.8083** | **0.9045** | 0.9351 |
+| 4 | 0.20 | 0.8240 | 0.8252 | 0.9161 | 0.9345 |
+
+Run 4 (τ=0.20) had the best aggregate numbers, but misclassified *"gonna unalive myself fr fr cant take this anymore"* as **Allowed**. That is one of the most well-documented suicide-related algospeak phrases in existence. A false negative on a phrase like that is a worse failure than a 1.75-point drop in overall accuracy, so **τ=0.15 was chosen as the final model**.
+
+---
+
+### Final Model: τ = 0.15
+
+| Metric | Val | Test |
+|---|---|---|
+| Accuracy | 0.8642 | 0.8065 |
+| Macro F1 | 0.8648 | 0.8083 |
+| Mean AUC | 0.9600 | 0.9351 |
+
+**Per-class test performance:**
+
+| Class | Precision | Recall | F1 |
+|---|---|---|---|
+| Allowed | 0.8065 | 0.8621 | 0.8333 |
+| Obscene Language | 0.7363 | 0.7701 | 0.7528 |
+| Mature Content | 0.7750 | 0.7126 | 0.7425 |
+| Algospeak | 0.9221 | 0.8875 | **0.9045** |
+
+Algospeak is the strongest class, which is the point. The remaining error is concentrated at the Obscene Language / Mature Content boundary, where surface vocabulary overlaps heavily (words like "rape" or "shoot" appear in both) and only broader context separates them.
+
+---
+
+*Built with BERTweet (VinAI), PyTorch, and Streamlit. Spring 2026.*
+"""
+

 @st.cache_resource(show_spinner="Loading model...")
 def load_model():
@@ -57,7 +177,6 @@ def load_model():
 def get_scheduler():
     import shutil
     LOG_DIR.mkdir(exist_ok=True)
-    # Pull existing log from HF on startup so we append instead of overwrite
    try:
        existing = hf_hub_download(
            repo_id=LOG_REPO,
@@ -66,7 +185,7 @@ def get_scheduler():
        )
        shutil.copy(existing, LOG_FILE)
    except Exception:
-        pass  # no log yet, start fresh
+        pass
    return CommitScheduler(
        repo_id=LOG_REPO,
        repo_type="dataset",
@@ -80,13 +199,13 @@ def log_prediction(text, result):
    scheduler = get_scheduler()
    scores = result["scores"]
    row = {
        "text": text,
        "predicted_label": result["predicted_label"],
        "score_allowed": round(scores["Allowed"], 4),
-        "score_offensive": round(scores["Offensive Language"], 4),
+        "score_obscene": round(scores["Obscene Language"], 4),
        "score_mature": round(scores["Mature Content"], 4),
        "score_algospeak": round(scores["Algospeak"], 4),
        "timestamp": datetime.utcnow().isoformat(),
    }
    with scheduler.lock:
        write_header = not LOG_FILE.exists()
@@ -98,17 +217,51 @@ def log_prediction(text, result):


 # ─────────────────────────────────────────────────────────────────────
-# UI
+# CSS: makes the easter egg popover button invisible until hovered
 # ─────────────────────────────────────────────────────────────────────

-st.title("Algospeak Classifier")
-st.caption("Dual BERTweet model · type a social media post to classify it.")
+st.markdown("""
+<style>
+div[data-testid="stPopover"] button {
+    opacity: 0.04;
+    transition: opacity 0.25s ease;
+    font-size: 11px;
+    color: #888;
+    border: none;
+    background: transparent;
+    padding: 2px 6px;
+}
+div[data-testid="stPopover"] button:hover {
+    opacity: 0.55;
+}
+</style>
+""", unsafe_allow_html=True)
+
+
+# ─────────────────────────────────────────────────────────────────────
+# Header row: title left, easter egg right
+# ─────────────────────────────────────────────────────────────────────
+
+title_col, egg_col = st.columns([11, 1])
+
+with title_col:
+    st.title("Algospeak Classifier")
+    st.caption("Dual BERTweet model · type a social media post to classify it.")
+
+with egg_col:
+    with st.popover("◉"):
+        st.markdown(ABOUT_MD)
+
+
+# ─────────────────────────────────────────────────────────────────────
+# Main UI
+# ─────────────────────────────────────────────────────────────────────

 text = st.text_area("Post text", height=120, placeholder="Type something here...")

 if st.button("Classify", type="primary") and text.strip():
     encoder, prototypes, tokenizer, cfg, device = load_model()
-    result = classify_text(text, encoder, prototypes, tokenizer, cfg["max_length"], device)
+    result = classify_text(text, encoder, prototypes, tokenizer, cfg["max_length"], device, cfg["temperature"])

     label = result["predicted_label"]
     color = CLASS_COLORS[label]
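The overview added above describes two pieces of machinery worth seeing concretely: the inverse deny-term frequency weighting of the algospeak prototype, and nearest-prototype scoring with a temperature-scaled softmax. Below is a minimal numpy sketch of both, assuming unit-norm embeddings; the function names, shapes, and exact weighting scheme are illustrative, not the repo's actual implementation:

```python
# Sketch only: numpy stand-ins for the prototype logic described in ABOUT_MD.
# Names, shapes, and the precise weighting scheme are assumptions.
import numpy as np

def weighted_prototype(embs: np.ndarray, term_freqs: np.ndarray) -> np.ndarray:
    """Class centroid where each example is weighted by 1/frequency of its
    deny term, so rare coded forms pull the prototype as much as common ones.
    embs: (n, d) unit-norm embeddings; term_freqs: (n,) occurrence counts."""
    w = 1.0 / term_freqs                          # rarer term -> larger weight
    proto = (w[:, None] * embs).sum(axis=0) / w.sum()
    return proto / np.linalg.norm(proto)          # unit norm for cosine similarity

def classify(emb: np.ndarray, prototypes: np.ndarray, temperature: float = 0.15):
    """emb: (d,) unit-norm post embedding; prototypes: (4, d) unit-norm rows.
    Returns (predicted class index, softmax confidence scores)."""
    sim = prototypes @ emb                        # cosine similarities, shape (4,)
    logits = sim / temperature                    # temperature scales confidence
    scores = np.exp(logits - logits.max())        # numerically stable softmax
    return int(sim.argmax()), scores / scores.sum()
```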
poc/config.yaml CHANGED
@@ -4,7 +4,7 @@
 num_classes: 4
 class_labels:
   0: "Allowed"
-  1: "Offensive Language"
+  1: "Obscene Language"
   2: "Mature Content"
   3: "Algospeak"

@@ -24,7 +24,7 @@ fp16: true
 gradient_clip: 1.0

 # Loss
-temperature: 0.07
+temperature: 0.15

 # Paths (relative to project root)
 train_csv: "data/splits/train.csv"
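With this change the UI's confidence scores use the same τ the model was trained with, instead of the previously hardcoded 0.1. A minimal sketch of the intended flow, assuming `temperature` and `max_length` are top-level keys in `poc/config.yaml`; the snippet is illustrative, not repo code:

```python
# Illustrative only: how the config value reaches inference after this commit.
import yaml

with open("poc/config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["temperature"])  # 0.15 after this commit
# result = classify_text(text, encoder, prototypes, tokenizer,
#                        cfg["max_length"], device, cfg["temperature"])
```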
poc/src/inference.py CHANGED
@@ -48,12 +48,12 @@ BASE_DIR = Path(__file__).resolve().parent.parent.parent

 CLASS_PREFIX = {
     0: "Allowed:",
-    1: "Offensive Language:",
+    1: "Obscene Language:",
     2: "Mature Content:",
     3: "Algospeak:",
 }

-CLASS_NAMES = ["Allowed", "Offensive Language", "Mature Content", "Algospeak"]
+CLASS_NAMES = ["Allowed", "Obscene Language", "Mature Content", "Algospeak"]


 def load_config() -> dict:
@@ -251,7 +251,7 @@ def evaluate_split(
    }


-def classify_text(text: str, encoder, prototypes, tokenizer, max_length, device) -> dict:
+def classify_text(text: str, encoder, prototypes, tokenizer, max_length, device, temperature: float = 0.15) -> dict:
    """Classify a single raw text string. Returns predicted class and similarity scores."""
    enc = tokenizer(
        emoji.demojize(text), padding="max_length", truncation=True,
@@ -262,7 +262,7 @@ def classify_text(text: str, encoder, prototypes, tokenizer, max_length, device)
    emb = emb.cpu().numpy()

    sim = emb @ prototypes.T
-    scores = torch.softmax(torch.tensor(sim / 0.1), dim=-1).numpy()[0]
+    scores = torch.softmax(torch.tensor(sim / temperature), dim=-1).numpy()[0]
    pred = int(sim.argmax())

    return {
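For intuition on what `sim / temperature` does to the displayed scores, here is a toy comparison over the ablation's τ values, using made-up similarities rather than real model output:

```python
# Toy demonstration: temperature rescales confidence but not the argmax.
import torch

sim = torch.tensor([0.62, 0.55, 0.48, 0.70])  # fake cosine sims to 4 prototypes

for tau in (0.07, 0.10, 0.15, 0.20):
    probs = torch.softmax(sim / tau, dim=-1)
    print(f"tau={tau:.2f}  scores={[round(p, 3) for p in probs.tolist()]}")

# Lower tau sharpens the distribution toward the nearest prototype; higher tau
# flattens it. The predicted class (sim.argmax()) never changes here.
```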