timagonch Claude Sonnet 4.6 committed on
Commit
f11fe9d
·
1 Parent(s): 35dc479

Update to four_class best model: tau=0.15, expanded project overview

- Fix load_model() to use hf_hub_download (was using local paths)
- Update class label: Offensive Language → Obscene Language
- Update temperature: 0.07 → 0.15 in config and inference
- Add easter egg popover with full project history and model stats
- Pass temperature from config to classify_text (was hardcoded 0.1)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (3)
  1. app.py +171 -18
  2. poc/config.yaml +2 -2
  3. poc/src/inference.py +4 -4
app.py CHANGED
@@ -28,15 +28,135 @@ MODEL_REPO = "timagonch/algospeak-classifier-model"
 LOG_REPO = "timagonch/algospeak-logs"
 LOG_DIR = BASE_DIR / "logs"
 LOG_FILE = LOG_DIR / "predictions.csv"
-LOG_COLS = ["text", "predicted_label", "score_allowed", "score_offensive", "score_mature", "score_algospeak", "timestamp"]
+LOG_COLS = ["text", "predicted_label", "score_allowed", "score_obscene", "score_mature", "score_algospeak", "timestamp"]

 CLASS_COLORS = {
     "Allowed": "green",
-    "Offensive Language": "red",
+    "Obscene Language": "red",
     "Mature Content": "orange",
     "Algospeak": "violet",
 }

+ABOUT_MD = """
+## Algospeak Classifier: Project Overview
+
+This tool is the result of a semester-long research project exploring **algospeak detection** as part of a content moderation pipeline for social media. The goal was to classify posts not just by whether they contain harmful content, but by *how* that content is expressed, including coded language specifically designed to evade automated filters.
+
+---
+
+### What is Algospeak?
+
+Algospeak is a form of linguistic camouflage that emerged organically on platforms like TikTok, Bluesky, and Twitter/X. When users learn that certain words trigger automated takedowns, they develop workarounds: substitutions that carry the same meaning but bypass keyword filters:
+
+- **"unalive"** instead of suicide or self-harm
+- **"corn"** for explicit sexual content
+- **"k!ll", "k1ll", "k.i.l.l"** for violence
+- Phonetic swaps (e.g. "seggs"), emoji substitutions, abbreviations, repurposed innocent words
+
+The challenge is that these substitutions evolve constantly, vary by community, and are nearly impossible to keep up with using hand-crafted rules. The only durable solution is a model that understands *intent* from context.
+
+---
+
+### Architecture
+
+The model is a **Dual BERTweet** network: two separate BERTweet encoders (vinai/bertweet-base, ~270M parameters combined) trained jointly with a contrastive learning objective called Supervised InfoNCE:
+
+- **Supervised encoder**: receives label-prefixed text during training (e.g. `"Algospeak: gonna unalive myself"`). Acts as a teacher by injecting class identity directly into the text.
+- **Unsupervised encoder**: receives raw text only, and is trained to match the supervised encoder's embedding space via the InfoNCE loss.
+
+After training, the supervised encoder is discarded entirely. At inference, the unsupervised encoder embeds an incoming post and compares it via cosine similarity against four **class prototypes**, the average embedding per class computed from the training set. The nearest prototype wins. The algospeak prototype uses inverse deny-term frequency weighting so rarer coded forms aren't drowned out by common ones.
+
+This approach was chosen specifically because it requires no rulesets, no exemplar lookup, and no deny list at inference time: just a single forward pass and a dot product.
+
+---
+
+### Data Collection & Manual Reclassification
+
+The dataset was built from Bluesky social media posts collected by the team. Raw posts came in with initial labels, but those labels were noisy, so a careful manual re-review pass was done across the dataset.
+
+To improve consistency on the class 1 / class 2 boundary, **two deny lists** were built:
+- `deny_list_class1.txt`: 115 terms covering slurs and hate speech
+- `deny_list_class2.txt`: 521 terms covering explicit sexual content, drugs, and violence
+
+A reclassification script applied deny-list hit logic: if a post contained a term from a list and had been labeled in the wrong class, it was overridden. This pass changed ~25,000 labels across the dataset, producing a cleaner `reclassified_final.csv` as the new source of truth.
+
+---
+
+### Synthetic Algospeak Generation
+
+Class 3 (Algospeak) was by far the hardest class to collect naturally. Real algospeak examples are sparse and inconsistently labeled. To address this, a **GPT-4-turbo generation pipeline** was built that takes class 1 and 2 posts and transforms them into algospeak equivalents.
+
+The pipeline used a 7-technique taxonomy grounded in documented community behavior: character substitution, phonetic swaps, pictorial (emoji), abbreviation, repurposing of innocent words, paraphrase, and known community-specific terms. Each term was assigned a technique only if there was a documented example in a hints file, preventing the model from hallucinating plausible-but-wrong substitutions. A deny-term inflection detector ensured that forms like "stabbing" (not just "stab") were correctly passed to the generator.
+
+This produced **13,264 algospeak pairs** (original + transformed), with the original post always kept in the same split as its algospeak counterpart to prevent leakage.
+
+---
+
+### Training Progression
+
+The model went through several iterations as the dataset and architecture evolved:
+
+**~10k/class: first dual BERTweet run (Apr 6)**
+The 414-rule exemplar system was abandoned and replaced with the dual BERTweet architecture. The first full run used ~10,000 posts per class from the cleaned dataset, with a simple random split. Result: **test accuracy 79.9%**.
+
+**~13k/class: group-aware split added (Apr 12)**
+The dataset grew to ~13,300 posts per class using the full synthetic pairs. Critically, a **group-aware split** was introduced: original posts and their algospeak counterparts are always assigned to the same split. Without this, the model can train on a post and be evaluated on a near-identical transformed version, inflating results. With it: **test accuracy 85.9%**.
+
+**~13k/class: weighted prototype + fix (Apr 13)**
+The algospeak class prototype was upgraded to use inverse deny-term frequency weighting, giving rarer substitution forms more influence on the prototype center. A data loader fix was also applied. Result: **test accuracy 89.4%**, the best result on the full dataset.
+
+**LLM audit & reclassification (Apr 16)**
+A GPT-4o-mini audit reclassified ~39,000 posts from the existing splits. The LLM had stricter criteria for class 2 (Mature Content), which collapsed many borderline posts into class 0. This reduced class 2 to ~3,300 posts, a sharp drop from 13k, and the new splits had to be rebalanced much smaller. Result: **test accuracy 76.5%**. The bottleneck had shifted to class 2.
+
+**3-class experiment (Apr 16)**
+As a parallel track, classes 1 and 2 were merged into a single "Harmful Content" class, reducing the problem to 3 classes. With fewer boundaries to learn, the model performed strongly: **test accuracy 89.2%, Algospeak F1 = 93.8%**. This confirmed the architecture works well; the difficulty is class 1 vs. class 2 separation.
+
+---
+
+### Four-Class Controlled Experiment (This Model)
+
+With the full dataset constrained by class 2 data scarcity, a focused experiment was run on a cleaner, smaller, more carefully curated subset of ~874 posts per class. The synthetic generation pipeline was rerun with stricter controls, producing 429 new algospeak examples. The two deny lists were merged into a single experiment-local list to avoid cross-contamination between class 1 and class 2 deny terms.
+
+#### Temperature Ablation
+
+Temperature (τ) controls the sharpness of the contrastive loss gradient. Lower τ forces tighter clusters, which risks overfitting on small datasets; higher τ acts as regularization. Four runs were compared:
+
+| Run | τ | Test Acc | Macro F1 | Algospeak F1 | Mean AUC |
+|-----|------|----------|----------|--------------|----------|
+| 1 | 0.10 | 0.7918 | 0.7957 | 0.9032 | 0.9452 |
+| 2 | 0.07 | 0.7214 | 0.7256 | 0.8138 | 0.8979 |
+| **3 ✓** | **0.15** | **0.8065** | **0.8083** | **0.9045** | 0.9351 |
+| 4 | 0.20 | 0.8240 | 0.8252 | 0.9161 | 0.9345 |
+
+Run 4 (τ=0.20) had the best aggregate numbers, but misclassified *"gonna unalive myself fr fr cant take this anymore"* as **Allowed**. That is one of the most well-documented suicide-related algospeak phrases in existence. A false negative on a phrase like that is a worse failure than a 1.75-point drop in overall accuracy, so **τ=0.15 was chosen as the final model**.
+
+---
+
+### Final Model: τ = 0.15
+
+| Metric | Val | Test |
+|---|---|---|
+| Accuracy | 0.8642 | 0.8065 |
+| Macro F1 | 0.8648 | 0.8083 |
+| Mean AUC | 0.9600 | 0.9351 |
+
+**Per-class test performance:**
+
+| Class | Precision | Recall | F1 |
+|---|---|---|---|
+| Allowed | 0.8065 | 0.8621 | 0.8333 |
+| Obscene Language | 0.7363 | 0.7701 | 0.7528 |
+| Mature Content | 0.7750 | 0.7126 | 0.7425 |
+| Algospeak | 0.9221 | 0.8875 | **0.9045** |
+
+Algospeak is the strongest class, which is the point. The remaining error is concentrated at the Obscene Language / Mature Content boundary, where surface vocabulary overlaps heavily (words like "rape" or "shoot" appear in both) and only broader context separates them.
+
+---
+
+*Built with BERTweet (VinAI), PyTorch, and Streamlit. Spring 2026.*
+"""
+

 @st.cache_resource(show_spinner="Loading model...")
 def load_model():
@@ -57,7 +177,6 @@ def load_model():
 def get_scheduler():
     import shutil
     LOG_DIR.mkdir(exist_ok=True)
-    # Pull existing log from HF on startup so we append instead of overwrite
    try:
        existing = hf_hub_download(
            repo_id=LOG_REPO,
@@ -66,7 +185,7 @@ def get_scheduler():
        )
        shutil.copy(existing, LOG_FILE)
    except Exception:
-        pass  # no log yet, start fresh
+        pass
    return CommitScheduler(
        repo_id=LOG_REPO,
        repo_type="dataset",
@@ -80,13 +199,13 @@ def log_prediction(text, result):
    scheduler = get_scheduler()
    scores = result["scores"]
    row = {
        "text": text,
        "predicted_label": result["predicted_label"],
        "score_allowed": round(scores["Allowed"], 4),
-        "score_offensive": round(scores["Offensive Language"], 4),
+        "score_obscene": round(scores["Obscene Language"], 4),
        "score_mature": round(scores["Mature Content"], 4),
        "score_algospeak": round(scores["Algospeak"], 4),
        "timestamp": datetime.utcnow().isoformat(),
    }
    with scheduler.lock:
        write_header = not LOG_FILE.exists()
@@ -98,17 +217,51 @@ def log_prediction(text, result):


 # ─────────────────────────────────────────────────────────────────────
-# UI
+# CSS: makes the easter egg popover button invisible until hovered
 # ─────────────────────────────────────────────────────────────────────

-st.title("Algospeak Classifier")
-st.caption("Dual BERTweet model · type a social media post to classify it.")
+st.markdown("""
+<style>
+div[data-testid="stPopover"] button {
+    opacity: 0.04;
+    transition: opacity 0.25s ease;
+    font-size: 11px;
+    color: #888;
+    border: none;
+    background: transparent;
+    padding: 2px 6px;
+}
+div[data-testid="stPopover"] button:hover {
+    opacity: 0.55;
+}
+</style>
+""", unsafe_allow_html=True)
+
+
+# ─────────────────────────────────────────────────────────────────────
+# Header row: title left, easter egg right
+# ─────────────────────────────────────────────────────────────────────
+
+title_col, egg_col = st.columns([11, 1])
+
+with title_col:
+    st.title("Algospeak Classifier")
+    st.caption("Dual BERTweet model · type a social media post to classify it.")
+
+with egg_col:
+    with st.popover("◉"):
+        st.markdown(ABOUT_MD)
+
+
+# ─────────────────────────────────────────────────────────────────────
+# Main UI
+# ─────────────────────────────────────────────────────────────────────

 text = st.text_area("Post text", height=120, placeholder="Type something here...")

 if st.button("Classify", type="primary") and text.strip():
     encoder, prototypes, tokenizer, cfg, device = load_model()
-    result = classify_text(text, encoder, prototypes, tokenizer, cfg["max_length"], device)
+    result = classify_text(text, encoder, prototypes, tokenizer, cfg["max_length"], device, cfg["temperature"])

     label = result["predicted_label"]
     color = CLASS_COLORS[label]
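The overview added above describes two pieces of machinery worth seeing concretely: the inverse deny-term frequency weighting of the algospeak prototype, and nearest-prototype scoring with a temperature-scaled softmax. Below is a minimal numpy sketch of both, assuming unit-norm embeddings; the function names, shapes, and exact weighting scheme are illustrative, not the repo's actual implementation:

```python
# Sketch only: numpy stand-ins for the prototype logic described in ABOUT_MD.
# Names, shapes, and the precise weighting scheme are assumptions.
import numpy as np

def weighted_prototype(embs: np.ndarray, term_freqs: np.ndarray) -> np.ndarray:
    """Class centroid where each example is weighted by 1/frequency of its
    deny term, so rare coded forms pull the prototype as much as common ones.
    embs: (n, d) unit-norm embeddings; term_freqs: (n,) occurrence counts."""
    w = 1.0 / term_freqs                          # rarer term -> larger weight
    proto = (w[:, None] * embs).sum(axis=0) / w.sum()
    return proto / np.linalg.norm(proto)          # unit norm for cosine similarity

def classify(emb: np.ndarray, prototypes: np.ndarray, temperature: float = 0.15):
    """emb: (d,) unit-norm post embedding; prototypes: (4, d) unit-norm rows.
    Returns (predicted class index, softmax confidence scores)."""
    sim = prototypes @ emb                        # cosine similarities, shape (4,)
    logits = sim / temperature                    # temperature scales confidence
    scores = np.exp(logits - logits.max())        # numerically stable softmax
    return int(sim.argmax()), scores / scores.sum()
```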
poc/config.yaml CHANGED
@@ -4,7 +4,7 @@
 num_classes: 4
 class_labels:
   0: "Allowed"
-  1: "Offensive Language"
+  1: "Obscene Language"
   2: "Mature Content"
   3: "Algospeak"

@@ -24,7 +24,7 @@ fp16: true
 gradient_clip: 1.0

 # Loss
-temperature: 0.07
+temperature: 0.15

 # Paths (relative to project root)
 train_csv: "data/splits/train.csv"
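With this change the UI's confidence scores use the same τ the model was trained with, instead of the previously hardcoded 0.1. A minimal sketch of the intended flow, assuming `temperature` and `max_length` are top-level keys in `poc/config.yaml`; the snippet is illustrative, not repo code:

```python
# Illustrative only: how the config value reaches inference after this commit.
import yaml

with open("poc/config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["temperature"])  # 0.15 after this commit
# result = classify_text(text, encoder, prototypes, tokenizer,
#                        cfg["max_length"], device, cfg["temperature"])
```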
poc/src/inference.py CHANGED
@@ -48,12 +48,12 @@ BASE_DIR = Path(__file__).resolve().parent.parent.parent

 CLASS_PREFIX = {
     0: "Allowed:",
-    1: "Offensive Language:",
+    1: "Obscene Language:",
     2: "Mature Content:",
     3: "Algospeak:",
 }

-CLASS_NAMES = ["Allowed", "Offensive Language", "Mature Content", "Algospeak"]
+CLASS_NAMES = ["Allowed", "Obscene Language", "Mature Content", "Algospeak"]


 def load_config() -> dict:
@@ -251,7 +251,7 @@ def evaluate_split(
    }


-def classify_text(text: str, encoder, prototypes, tokenizer, max_length, device) -> dict:
+def classify_text(text: str, encoder, prototypes, tokenizer, max_length, device, temperature: float = 0.15) -> dict:
    """Classify a single raw text string. Returns predicted class and similarity scores."""
    enc = tokenizer(
        emoji.demojize(text), padding="max_length", truncation=True,
@@ -262,7 +262,7 @@ def classify_text(text: str, encoder, prototypes, tokenizer, max_length, device)
    emb = emb.cpu().numpy()

    sim = emb @ prototypes.T
-    scores = torch.softmax(torch.tensor(sim / 0.1), dim=-1).numpy()[0]
+    scores = torch.softmax(torch.tensor(sim / temperature), dim=-1).numpy()[0]
    pred = int(sim.argmax())

    return {
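For intuition on what `sim / temperature` does to the displayed scores, here is a toy comparison over the ablation's τ values, using made-up similarities rather than real model output:

```python
# Toy demonstration: temperature rescales confidence but not the argmax.
import torch

sim = torch.tensor([0.62, 0.55, 0.48, 0.70])  # fake cosine sims to 4 prototypes

for tau in (0.07, 0.10, 0.15, 0.20):
    probs = torch.softmax(sim / tau, dim=-1)
    print(f"tau={tau:.2f}  scores={[round(p, 3) for p in probs.tolist()]}")

# Lower tau sharpens the distribution toward the nearest prototype; higher tau
# flattens it. The predicted class (sim.argmax()) never changes here.
```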