DivYonko committed on
Commit
b9d31ba
·
1 Parent(s): 1baaf30

Replace ML ensemble with pure keyword sentiment engine

Browse files
Files changed (4) hide show
  1. Dockerfile +0 -3
  2. README.md +7 -5
  3. ml/sentiment_model.py +373 -264
  4. requirements.txt +1 -6
Dockerfile CHANGED
@@ -13,9 +13,6 @@ RUN pip install --no-cache-dir -r requirements.txt
13
 
14
  COPY . .
15
 
16
- # Suppress Streamlit's file watcher scanning transformers (harmless but noisy)
17
- ENV STREAMLIT_SERVER_FILE_WATCHER_TYPE=none
18
-
19
  EXPOSE 7860
20
 
21
  HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
 
13
 
14
  COPY . .
15
 
 
 
 
16
  EXPOSE 7860
17
 
18
  HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
README.md CHANGED
@@ -15,12 +15,14 @@ Real-time Hinglish sentiment and topic analysis for YouTube live streams.
15
 
16
  ## Features
17
 
18
- - Real-time chat scraping via pytchat
19
- - Sentiment classification (Positive / Neutral / Negative) using a 3-model ensemble
20
- - Fine-tuned MuRIL (Hinglish-aware)
21
- - XLM-RoBERTa (multilingual Twitter model)
22
- - Multilingual sentiment model
 
23
  - Topic classification (Appreciation / Question / Promo / Spam / MCQ Answer / General)
 
24
  - Interactive Streamlit dashboard with live auto-refresh
25
  - Start/stop scraper directly from the UI
26
  - Multi-stream comparison (up to 5 streams)
 
15
 
16
  ## Features
17
 
18
+ - Real-time chat scraping via YouTube Data API v3
19
+ - Sentiment classification (Positive / Neutral / Negative) using a pure keyword engine
20
+ - Expanded Hinglish + English + regional slang keyword sets
21
+ - Negation handling ("nahi accha" → Negative)
22
+ - Intensifier boost ("bahut accha" → higher confidence)
23
+ - Emoji sentiment scoring
24
  - Topic classification (Appreciation / Question / Promo / Spam / MCQ Answer / General)
25
+ - Action type classification (28 fine-grained categories, fully keyword-based)
26
  - Interactive Streamlit dashboard with live auto-refresh
27
  - Start/stop scraper directly from the UI
28
  - Multi-stream comparison (up to 5 streams)
ml/sentiment_model.py CHANGED
@@ -1,305 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import re
4
- import threading
5
- from functools import lru_cache
6
 
7
  import emoji
8
- import torch
9
- import torch.nn.functional as F
10
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
11
-
12
-
13
- # ── Model paths ────────────────────────────────────────────────────────────────
14
- MURIL_MODEL = "./new_trained_data/muril-sentimix"
15
- XLMR_MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
16
- MULTI_MODEL = "tabularisai/multilingual-sentiment-analysis"
17
-
18
- LABELS = ["Negative", "Neutral", "Positive"]
19
-
20
- # Weights
21
- MURIL_WEIGHT = 0.40
22
- XLMR_WEIGHT = 0.35
23
- MULTI_WEIGHT = 0.25
24
-
25
-
26
- # ── Lazy loading ───────────────────────────────────────────────────────────────
27
- _lock = threading.Lock()
28
-
29
- _muril_tokenizer = _muril_model = None
30
- _xlmr_tokenizer = _xlmr_model = None
31
- _multi_tokenizer = _multi_model = None
32
- _models_loaded = False
33
- _load_error: Exception | None = None
34
-
35
-
36
- def _load_models():
37
- global _muril_tokenizer, _muril_model
38
- global _xlmr_tokenizer, _xlmr_model
39
- global _multi_tokenizer, _multi_model
40
- global _models_loaded, _load_error
41
-
42
- if _models_loaded:
43
- return
44
-
45
- with _lock:
46
- if _models_loaded:
47
- return
48
-
49
- print("[sentiment] Loading models...")
50
- try:
51
- _muril_tokenizer = AutoTokenizer.from_pretrained(MURIL_MODEL)
52
- _muril_model = AutoModelForSequenceClassification.from_pretrained(MURIL_MODEL)
53
- print(f"[sentiment] MuRIL loaded β€” id2label: {_muril_model.config.id2label}")
54
-
55
- _xlmr_tokenizer = AutoTokenizer.from_pretrained(XLMR_MODEL)
56
- _xlmr_model = AutoModelForSequenceClassification.from_pretrained(XLMR_MODEL)
57
- print(f"[sentiment] XLM-R loaded β€” id2label: {_xlmr_model.config.id2label}")
58
-
59
- _multi_tokenizer = AutoTokenizer.from_pretrained(MULTI_MODEL)
60
- _multi_model = AutoModelForSequenceClassification.from_pretrained(MULTI_MODEL)
61
- print(f"[sentiment] Multilingual loaded β€” id2label: {_multi_model.config.id2label}")
62
-
63
- _muril_model.eval()
64
- _xlmr_model.eval()
65
- _multi_model.eval()
66
-
67
- if torch.cuda.is_available():
68
- _muril_model.to("cuda")
69
- _xlmr_model.to("cuda")
70
- _multi_model.to("cuda")
71
-
72
- _models_loaded = True
73
- print("[sentiment] All models ready βœ“")
74
-
75
- except Exception as exc:
76
- _load_error = exc
77
- print(f"[sentiment] ERROR loading models: {exc}")
78
- raise
79
-
80
-
81
- def _device():
82
- if not _models_loaded:
83
- _load_models()
84
- return next(_muril_model.parameters()).device
85
-
86
-
87
- # ── Text normalization ─────────────────────────────────────────────────────────
88
- def _normalize_repeated_chars(text: str) -> str:
89
- return re.sub(r"(.)\1{2,}", r"\1\1", text)
90
 
91
 
92
  # ── Emoji scoring ──────────────────────────────────────────────────────────────
93
- _POS_KW = {"love", "fire", "happy", "laugh", "win", "cool", "best", "heart", "smile", "star", "clap", "pray", "sparkle", "sun", "rainbow"}
94
- _NEG_KW = {"angry", "sad", "cry", "worst", "bad", "hate", "skull", "vomit", "rage", "broken", "disappointed"}
 
 
 
 
 
 
 
 
 
 
 
95
 
96
 
97
- def _emoji_score(text: str):
98
- score = 0
 
99
  for ch in text:
100
  if emoji.is_emoji(ch):
101
  name = emoji.demojize(ch)
102
- if any(k in name for k in _POS_KW):
103
- score += 0.2
104
- elif any(k in name for k in _NEG_KW):
105
- score -= 0.2
106
- return score
107
-
108
-
109
- # ── Hinglish slang ─────────────────────────────────────────────────────────────
110
- _SLANG = {
111
- # Positive
112
- "mast": "excellent",
113
- "op": "excellent",
114
- "lit": "amazing",
115
- "sahi": "correct good",
116
- "jhakaas": "awesome",
117
- "kadak": "strong good",
118
- "zabardast": "fantastic",
119
- "kamaal": "amazing",
120
- "bindaas": "great",
121
- "ekdum": "absolutely",
122
- "shandar": "splendid",
123
- "lajawaab": "outstanding",
124
- "waah": "wow great",
125
- "wah": "wow great",
126
- "superb": "excellent",
127
- "osm": "awesome",
128
- "awsm": "awesome",
129
- "gr8": "great",
130
- "lajawab": "outstanding",
131
- "dhansu": "awesome",
132
- "fatafat": "excellent quick",
133
- "mazza": "fun enjoyable",
134
- "maja": "fun enjoyable",
135
- "acha": "good",
136
- "accha": "good",
137
- "badhiya": "very good",
138
- "shukriya": "thank you grateful",
139
- "dhanyawad": "thank you grateful",
140
- "love": "love positive",
141
- "pyaar": "love positive",
142
-
143
- # Negative
144
- "bakwas": "nonsense bad",
145
- "faltu": "useless bad",
146
- "bekar": "useless bad",
147
- "ghatiya": "terrible bad",
148
- "wahiyat": "awful bad",
149
- "bura": "bad negative",
150
- "kharab": "bad negative",
151
- "boring": "boring negative",
152
- "bekaar": "useless bad",
153
- "chutiya": "stupid offensive",
154
- "ullu": "fool negative",
155
- "pagal": "crazy negative",
156
- "besharam": "shameless negative",
157
- "nafrat": "hate negative",
158
- "gussa": "angry negative",
159
- "naraaz": "angry upset",
160
- "dukh": "sad negative",
161
- "takleef": "pain negative",
162
- "mushkil": "difficult negative",
163
- "problem": "problem negative",
164
  }
165
 
 
 
166
 
167
- def _preprocess(text: str) -> str:
168
- text = _normalize_repeated_chars(text)
169
 
170
- text = emoji.replace_emoji(
171
- text,
172
- replace=lambda ch, data_dict: f" {emoji.demojize(ch).strip(':')} " if emoji.is_emoji(ch) else ch
173
- )
174
 
175
- text = text.lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
- words = []
178
- for w in text.split():
179
- if w in _SLANG:
180
- words.append(_SLANG[w])
181
- else:
182
- words.append(w)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
- text = " ".join(words)
185
- text = re.sub(r"[^\w\s]", "", text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
- return text.strip()
188
 
 
189
 
190
- # ── Fast path ──────────────────────────────────────────────────────────────────
191
- _POS_SLANG = {"mast", "op", "lit", "sahi", "jhakaas", "kadak", "zabardast", "kamaal",
192
- "bindaas", "shandar", "lajawaab", "lajawab", "waah", "wah", "superb",
193
- "osm", "awsm", "dhansu", "badhiya", "maja", "mazza", "acha", "accha",
194
- "ekdum", "love", "pyaar", "shukriya", "dhanyawad"}
195
- _NEG_SLANG = {"bakwas", "faltu", "bekar", "bekaar", "ghatiya", "wahiyat", "bura",
196
- "kharab", "boring", "ullu", "nafrat", "gussa", "naraaz"}
 
 
 
197
 
198
 
199
- def _fast_path(text: str):
200
- stripped = text.strip().lower()
201
 
202
- if len(stripped) <= 2:
203
- return "Neutral", 0.6
 
204
 
205
- words = set(stripped.split())
 
 
206
 
207
- pos_hits = len(words & _POS_SLANG)
208
- neg_hits = len(words & _NEG_SLANG)
209
 
210
- if pos_hits > neg_hits and pos_hits >= 1:
211
- return "Positive", min(0.75 + 0.05 * pos_hits, 0.92)
212
- if neg_hits > pos_hits and neg_hits >= 1:
213
- return "Negative", min(0.75 + 0.05 * neg_hits, 0.92)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
- return None
 
 
 
216
 
 
 
 
217
 
218
- # ── Model inference ────────────────────────────────────────────────────────────
219
- # Canonical label order used throughout the ensemble
220
- _CANONICAL = ["Negative", "Neutral", "Positive"]
221
 
222
- # Normalise a label string so casing/spacing differences don't matter β€” used in _align_probs
223
 
 
224
 
225
- def _align_probs(probs: torch.Tensor, id2label: dict) -> torch.Tensor:
226
  """
227
- Reorder/collapse `probs` to always produce [Negative, Neutral, Positive].
228
- Handles both 3-class and 5-class (Very Negative/Negative/Neutral/Positive/Very Positive) models.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  """
230
- # 5-class: collapse Very Negative→Negative, Very Positive→Positive
231
- _5CLASS_MAP = {
232
- "very negative": 0, "negative": 0, "neg": 0,
233
- "neutral": 1, "neu": 1,
234
- "positive": 2, "pos": 2, "very positive": 2,
235
- }
236
- _3CLASS_MAP = {
237
- "negative": 0, "neg": 0,
238
- "neutral": 1, "neu": 1,
239
- "positive": 2, "pos": 2,
240
- }
241
- label_map = _5CLASS_MAP if len(id2label) == 5 else _3CLASS_MAP
242
- try:
243
- aligned = torch.zeros(3, device=probs.device)
244
- for native_idx, label in id2label.items():
245
- canonical_idx = label_map[label.lower()]
246
- aligned[canonical_idx] += probs[native_idx]
247
- return aligned
248
- except (KeyError, IndexError):
249
- print(f"[sentiment] WARNING: could not align labels {id2label}, using raw order")
250
- return probs[:3]
251
-
252
-
253
- def _infer_aligned(tokenizer, model, text: str) -> torch.Tensor:
254
- """Run inference and return probs aligned to [Negative, Neutral, Positive]."""
255
- device = _device()
256
-
257
- inputs = tokenizer(
258
- text,
259
- return_tensors="pt",
260
- truncation=True,
261
- max_length=128,
262
- padding=True,
263
- ).to(device)
264
-
265
- with torch.no_grad():
266
- logits = model(**inputs).logits
267
-
268
- probs = F.softmax(logits, dim=-1).squeeze()
269
- return _align_probs(probs, model.config.id2label)
270
-
271
-
272
- # ── Ensemble ───────────────────────────────────────────────────────────────────
273
- @lru_cache(maxsize=512)
274
- def _ensemble(text):
275
- _load_models()
276
-
277
- p_muril = _infer_aligned(_muril_tokenizer, _muril_model, text)
278
- p_xlmr = _infer_aligned(_xlmr_tokenizer, _xlmr_model, text)
279
- p_multi = _infer_aligned(_multi_tokenizer, _multi_model, text)
280
-
281
- probs = MURIL_WEIGHT * p_muril + XLMR_WEIGHT * p_xlmr + MULTI_WEIGHT * p_multi
282
-
283
- conf, idx = torch.max(probs, dim=0)
284
-
285
- return _CANONICAL[idx.item()], conf.item()
286
-
287
-
288
- # ── Public API ─────────────────────────────────────────────────────────────────
289
- def predict_sentiment(text: str):
290
-
291
- fast = _fast_path(text)
292
- if fast:
293
- return fast
294
-
295
- clean = _preprocess(text)
296
-
297
- if not clean:
298
  return "Neutral", 0.55
299
 
300
- label, conf = _ensemble(clean)
301
 
302
- boost = _emoji_score(text)
303
- conf = max(0, min(conf + boost, 1))
 
304
 
305
- return label, round(conf, 2)
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ ml/sentiment_model.py
4
+ =====================
5
+ Pure keyword/rule-based sentiment classifier for YouTube live-chat comments.
6
+ No ML models are loaded β€” classification is entirely keyword/regex-based.
7
+
8
+ Approach
9
+ --------
10
+ 1. Emoji scoring β€” positive/negative emoji characters boost confidence
11
+ 2. Negation check — "nahi accha" flips Positive → Negative
12
+ 3. Intensifier boost β€” "bahut accha" raises confidence
13
+ 4. Keyword matching β€” expanded Hinglish + English + regional + typo variants
14
+ 5. Fallback β€” Neutral at 0.55 if nothing fires
15
+
16
+ Public API
17
+ ----------
18
+ predict_sentiment(text: str) -> tuple[str, float]
19
+ Returns (label, confidence) where label ∈ {"Positive", "Neutral", "Negative"}
20
+ and confidence ∈ [0.50, 0.95].
21
+ """
22
+
23
  from __future__ import annotations
24
 
25
  import re
 
 
26
 
27
  import emoji
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
 
30
# ── Emoji scoring ──────────────────────────────────────────────────────────────
# Keyword fragments matched against demojized emoji names (e.g. ":red_heart:").
_EMOJI_POS_KW = {
    "love", "fire", "happy", "laugh", "win", "cool", "best", "heart",
    "smile", "star", "clap", "pray", "sparkle", "sun", "rainbow",
    "thumbs_up", "raised_hands", "partying", "grinning", "beaming",
    "smiling", "joy", "hundred", "muscle", "trophy", "crown",
}
_EMOJI_NEG_KW = {
    "angry", "sad", "cry", "worst", "bad", "hate", "skull", "vomit",
    "rage", "broken", "disappointed", "thumbs_down", "weary", "tired",
    "loudly_crying", "fearful", "anguished", "confounded", "persevere",
    "unamused", "expressionless", "nauseated", "sneezing",
}


def _emoji_score(text: str) -> float:
    """Score the emoji sentiment of *text*; result clamped to [-0.4, 0.4].

    Each positive-looking emoji adds 0.15 and each negative-looking one
    subtracts 0.15; non-emoji characters are ignored. Keyword matching is
    substring-based on the demojized name, so e.g. "loudly_crying_face"
    matches both "loudly_crying" and "cry".
    """
    total = 0.0
    for char in text:
        if not emoji.is_emoji(char):
            continue
        demojized = emoji.demojize(char)
        if any(kw in demojized for kw in _EMOJI_POS_KW):
            total += 0.15
        elif any(kw in demojized for kw in _EMOJI_NEG_KW):
            total -= 0.15
    return min(0.4, max(total, -0.4))
57
+
58
+
59
# ── Negation words ─────────────────────────────────────────────────────────────
# A negation token near a sentiment keyword flips that keyword's polarity.
_NEGATION_WORDS: set[str] = {
    # Hindi / Hinglish
    "nahi", "nhi", "nahin", "na", "mat", "naa", "nope",
    "bilkul nahi", "kabhi nahi", "kabhi nhi",
    # English
    "not", "no", "never", "neither", "nor", "without",
    "don't", "dont", "doesn't", "doesnt", "didn't", "didnt",
    "can't", "cant", "won't", "wont", "isn't", "isnt",
    "wasn't", "wasnt", "aren't", "arent", "weren't", "werent",
    "hardly", "barely", "scarcely",
}

# How many tokens BEFORE a sentiment word are scanned for a negation.
_NEGATION_WINDOW = 3


def _is_negated(word_list: list[str], sentiment_idx: int) -> bool:
    """True when a negation token sits near word_list[sentiment_idx].

    Covers both orders:
      - pre-negation:  "nahi accha tha"  (negation within _NEGATION_WINDOW
        words before the sentiment word)
      - post-negation: "boring nahi tha" (negation up to 2 words after it)

    NOTE(review): multi-word entries such as "bilkul nahi" can never match
    a single token here; "nahi" alone still fires, so the net effect is
    the same.
    """
    lo = max(0, sentiment_idx - _NEGATION_WINDOW)
    preceding = word_list[lo:sentiment_idx]
    following = word_list[sentiment_idx + 1: sentiment_idx + 3]
    if not _NEGATION_WORDS.isdisjoint(preceding):
        return True
    return not _NEGATION_WORDS.isdisjoint(following)
92
+
93
+
94
# ── Intensifier words ──────────────────────────────────────────────────────────
# Words that amplify confidence when they appear just before a sentiment word.
_INTENSIFIERS: dict[str, float] = {
    # Hindi / Hinglish
    "bahut": 0.10,    # very
    "bohot": 0.10,
    "bht": 0.08,
    "ekdum": 0.12,    # absolutely
    "bilkul": 0.10,   # completely
    "itna": 0.08,     # this much
    "kitna": 0.06,
    "zyada": 0.08,    # more / too much
    "bohat": 0.10,
    "atyant": 0.10,   # extremely (formal Hindi)
    "sampurn": 0.08,  # completely
    # English
    "very": 0.08,
    "too": 0.08,
    "so": 0.06,
    "super": 0.10,
    "ultra": 0.10,
    "extremely": 0.12,
    "absolutely": 0.12,
    "totally": 0.10,
    "really": 0.08,
    "truly": 0.08,
    "highly": 0.08,
    "deeply": 0.08,
    "insanely": 0.10,
    "incredibly": 0.10,
    "genuinely": 0.08,
}

# How many tokens BEFORE a sentiment word are scanned for intensifiers.
_INTENSIFIER_WINDOW = 2


def _intensifier_boost(word_list: list[str], sentiment_idx: int) -> float:
    """Sum the intensifier boosts in the window just before
    word_list[sentiment_idx], capped at 0.15 overall."""
    lo = max(0, sentiment_idx - _INTENSIFIER_WINDOW)
    total = 0.0
    for token in word_list[lo:sentiment_idx]:
        total += _INTENSIFIERS.get(token, 0.0)
    # Cap so stacked intensifiers cannot dominate the confidence formula.
    return total if total < 0.15 else 0.15
136
+
137
+
138
# ── Positive keyword set ───────────────────────────────────────────────────────
# Entries are compared against tokens produced by _normalise(), which collapses
# runs of 3+ identical characters down to 2 ("amazinggg" -> "amazingg").
# Keywords must therefore exist in that post-normalisation form; see the
# "Post-normalisation variants" group added below.
# NOTE(review): multi-word entries ("vera level", "full josh", "paisa vasool")
# only fire if the caller performs phrase-level (n-gram) matching.
_POS_WORDS: set[str] = {
    # ── Core Hinglish slang ──
    "mast", "jhakaas", "kadak", "zabardast", "kamaal", "bindaas",
    "shandar", "lajawaab", "lajawab", "lajaab", "waah", "wah",
    "dhansu", "badhiya", "badiya", "maja", "mazza", "maza",
    "acha", "accha", "achha", "acha", "sahi", "sach",
    "shukriya", "dhanyawad", "dhanyavaad", "meherbani", "shukran",
    "pyaar", "pyar", "khushi", "khush",
    "fatafat", "jaldi",

    # ── Typo / abbreviation variants ──
    "osm", "awsm", "awsom", "awsome", "amzing", "amazng",
    "gr8", "grt", "gr9", "fab", "fabbb",
    "superrr", "amazinggg", "besttt", "niceee", "gooddd",
    "thku", "thnku", "thnkuu", "thnkyou", "thanku", "thankyou",
    "thnk", "thnq", "thnks", "thnx", "tysm", "tqsm", "thx", "ty",
    "ty", "tyvm", "tyvmm",

    # ── Post-normalisation variants ──
    # BUG FIX: the triple-letter spellings above ("amazinggg", "besttt",
    # "niceee", "gooddd", "superrr", "fabbb") could never match because
    # _normalise() reduces them to double letters before lookup.
    "amazingg", "superr", "bestt", "nicee", "goodd", "fabb",

    # ── English positive ──
    "amazing", "awesome", "excellent", "wonderful", "fantastic",
    "brilliant", "outstanding", "exceptional", "magnificent",
    "superb", "perfect", "great", "good", "nice", "beautiful",
    "lovely", "loved", "love", "best", "better",
    "helpful", "useful", "informative", "fruitful", "motivating",
    "motivational", "inspiring", "inspired", "insightful",
    "clear", "clarity", "simple", "easy", "smooth",
    "thankful", "grateful", "blessed", "proud",
    "happy", "glad", "pleased", "satisfied", "content",
    "enjoy", "enjoyed", "enjoying", "fun", "interesting",
    "impressive", "impressed", "incredible", "unbelievable",
    "top", "topnotch", "firstclass", "worldclass",
    "recommend", "recommended", "worth", "worthy",
    "thanks", "thank", "appreciate", "appreciated",
    "respect", "salute", "legend", "goat", "king", "queen",
    "bestest", "bestttttt", "much", "op", "lit",

    # ── Regional / South Indian Hinglish ──
    "semma",         # Tamil slang for awesome
    "mass",          # Tamil/Telugu slang for impressive
    "vera level",    # Tamil slang for next level
    "sema",          # variant of semma
    "bindass",       # variant of bindaas
    "dum",           # strength/power (positive context)
    "dhamakedaar",   # explosive/amazing
    "dhamaka",       # blast/amazing
    "toofan",        # storm (used positively)
    "jalwa",         # aura/presence (positive)
    "josh",          # enthusiasm/energy
    "full josh",
    "paisa vasool",  # worth the money
    "makkhan",       # butter smooth (positive)
    "solid",         # solid/strong (positive)
    "tight",         # tight/solid (positive slang)
    "fire",          # fire (positive slang)
    "goated",        # GOAT-ed (positive slang)
    "based",         # based (positive slang)
    "valid",         # valid (positive slang)
    "clean",         # clean explanation

    # ── Gratitude phrases (single tokens after normalization) ──
    "shukriyaa", "shukriyaaa", "dhanyawaad", "dhanyawaaad",
    "abhar",   # gratitude (formal Hindi)
    "aabhar",

    # ── Common live chat positives ──
    "woww", "wowww", "woah", "whoa", "yay", "yayy",
    "haha", "hahaha", "lol", "lmao",  # laughter = positive
    "clap", "claps", "bravo", "chappal",  # chappal = clap in some contexts
    "heart", "hearts",
    "100", "1000",  # "100%" positive
}
210
 
211
# ── Negative keyword set ───────────────────────────────────────────────────────
# Entries are compared against tokens produced by _normalise(), which collapses
# runs of 3+ identical characters down to 2 ("worstttt" -> "worstt").
# Keywords must therefore exist in that post-normalisation form; see the
# "Post-normalisation variants" group added below.
# NOTE(review): multi-word entries ("not good", "waste of time") only fire if
# the caller performs phrase-level (n-gram) matching.
_NEG_WORDS: set[str] = {
    # ── Core Hinglish slang ──
    "bakwas", "bakwaas", "bakwaaas",
    "faltu", "faltuu",
    "bekar", "bekaar", "bekaaar",
    "ghatiya", "ghatiiya",
    "wahiyat", "wahiyaat",
    "bura", "buraa",
    "kharab", "kharaaab",
    "boring", "borring", "booring",
    "ullu", "pagal", "paagal",
    "besharam", "besharaam",
    "nafrat", "gussa", "naraaz",
    "dukh", "takleef", "mushkil",
    "uruttu", "battamizi", "battameezi",
    "natak", "nautanki",
    "dhoka", "dhokha", "jhooth", "jhoota",
    "dikhawa", "dikhaawa",
    "beizzati", "beizzatii", "bezaati",
    "sharam", "sharaam",
    "galat", "galt",
    "jhanjhat", "jhamela",
    "tang", "pareshan", "pareshaan",
    "nirasha", "niraash",   # disappointment
    "thaka", "thakaan",     # tired/exhausted
    "dard", "peeda",        # pain
    "rona", "rota", "roti", # crying
    "cheat", "cheating",
    "fraud", "fraudiya",
    "loot", "loota", "looting",

    # ── English negative ──
    "useless", "unfair", "disappointing", "disappointed",
    "foolish", "stupid", "idiot", "idiotic",
    "terrible", "horrible", "awful", "dreadful",
    "worst", "worse", "bad", "poor",
    "waste", "wasted", "pathetic",
    "annoying", "annoyed", "irritating", "irritated",
    "frustrating", "frustrated", "frustration",
    "confusing", "confused", "confusion",
    "misleading", "clickbait",
    "fake", "scam", "spam",
    "hate", "hated", "hating",
    "angry", "anger", "rage",
    "sad", "sadness", "unhappy", "upset",
    "wrong", "incorrect", "error", "mistake",
    "problem", "issue", "bug", "broken",
    "slow", "lagging", "lag", "buffering",
    "crash", "crashed", "crashing",
    "fail", "failed", "failure",
    "ignore", "ignored", "ignoring",
    "rude", "disrespect", "disrespectful",
    "unfair", "biased", "bias",
    "overpriced", "expensive", "costly",
    "wtf", "wth", "omg",  # context-dependent but often negative in complaints
    "curse", "abusive",
    "liar", "lie", "lies",
    "cheat", "cheater",
    "regret", "regretted", "regrets",
    "never", "worst",

    # ── Typo / abbreviation variants ──
    "bakwaaas", "bekarrr", "borinnng",
    "worstttt", "terribleee",

    # ── Post-normalisation variants ──
    # BUG FIX: "kharaaab", "bekarrr", "borinnng", "worstttt" and
    # "terribleee" above could never match because _normalise() reduces
    # 3+ repeated characters to 2 before lookup.
    "kharaab", "bekarr", "borinng", "worstt", "terriblee",

    # ── Regional / South Indian Hinglish ──
    "kabaad",    # junk/trash
    "raddi",     # waste/junk
    "kachra",    # garbage
    "bekar",     # useless (already above)
    "nikamma",   # good-for-nothing
    "nalayak",   # incompetent
    "kamina",    # scoundrel
    "harami",    # offensive negative
    "bewakoof",  # fool
    "gadha",     # donkey (fool)
    "buddhu",    # fool
    "duffer",    # dull/stupid
    "flop",      # flop/failure
    "disaster",  # disaster
    "pathetic",  # pathetic (already above)
    "cringe",    # cringe
    "cap",       # cap = lie (slang)
    "mid",       # mid = mediocre/bad (slang)
    "trash",     # trash
    "garbage",   # garbage
    "dogwater",  # very bad (gaming slang)
    "lowkey bad",
    "not good",
    "not helpful",
    "not worth",
    "time waste",
    "time wasted",
    "waste of time",
}
307
 
 
308
 
309
# ── Text normalisation ─────────────────────────────────────────────────────────

def _normalise(text: str) -> str:
    """Normalise raw chat text for keyword lookup.

    Steps, in order:
      1. drop demojized emoji codes such as ":fire:" / ":thumbs_up:"
      2. lowercase
      3. collapse runs of 3+ identical characters to 2
         ("amazinggg" -> "amazingg", keeping doubles so "woww" still matches)
      4. collapse all whitespace runs to single spaces and strip the ends
      5. truncate to 512 characters
    """
    cleaned = re.sub(r":[a-z_]+:", " ", text).lower()
    cleaned = re.sub(r"(.)\1{2,}", r"\1\1", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return cleaned[:512]
321
 
322
 
323
# ── Core classification ────────────────────────────────────────────────────────

def _classify(text: str) -> tuple[str, float]:
    """
    Classify text via keyword matching with negation and intensifier handling.

    Parameters
    ----------
    text : str
        Raw comment text; normalised internally via _normalise().

    Returns
    -------
    (label, base_confidence) before any emoji adjustment, where label is
    one of "Positive", "Neutral", "Negative".
    """
    t = _normalise(text)

    # Messages of 1–2 characters carry no reliable signal.
    if len(t) <= 2:
        return "Neutral", 0.55

    word_list = t.split()

    pos_score = 0.0
    neg_score = 0.0
    pos_boost = 0.0
    neg_boost = 0.0

    def _score_hit(idx: int, positive: bool) -> None:
        # Tally one keyword hit anchored at word index `idx`;
        # a nearby negation word flips its polarity.
        nonlocal pos_score, neg_score, pos_boost, neg_boost
        if _is_negated(word_list, idx):
            positive = not positive
        boost = _intensifier_boost(word_list, idx)
        if positive:
            pos_score += 1.0
            pos_boost = max(pos_boost, boost)
        else:
            neg_score += 1.0
            neg_boost = max(neg_boost, boost)

    # Single-token matches.
    for idx, word in enumerate(word_list):
        if word in _POS_WORDS:
            _score_hit(idx, True)
        elif word in _NEG_WORDS:
            _score_hit(idx, False)

    # BUG FIX: the keyword sets contain multi-word phrases ("not good",
    # "vera level", "waste of time") that single-token matching could never
    # fire. Check 2- and 3-grams against the same sets; each phrase is
    # anchored at its first word for negation/intensifier lookups.
    n_words = len(word_list)
    for size in (2, 3):
        for idx in range(n_words - size + 1):
            phrase = " ".join(word_list[idx:idx + size])
            if phrase in _POS_WORDS:
                _score_hit(idx, True)
            elif phrase in _NEG_WORDS:
                _score_hit(idx, False)

    # No keyword fired at all -> Neutral fallback.
    if pos_score == 0 and neg_score == 0:
        return "Neutral", 0.55

    # Winner takes the label; confidence grows with hit count and the best
    # intensifier boost seen, capped at 0.92.
    if pos_score > neg_score:
        return "Positive", round(min(0.72 + 0.05 * pos_score + pos_boost, 0.92), 3)

    if neg_score > pos_score:
        return "Negative", round(min(0.72 + 0.05 * neg_score + neg_boost, 0.92), 3)

    # Tie -> Neutral with slightly raised confidence.
    return "Neutral", 0.58
 
379
 
 
380
 
381
# ── Public API ─────────────────────────────────────────────────────────────────

def predict_sentiment(text: str) -> tuple[str, float]:
    """
    Classify a comment's sentiment.

    Parameters
    ----------
    text : str
        Raw comment text (may be Hinglish, emoji-containing, mixed script, or None).

    Returns
    -------
    label : str
        One of "Positive", "Neutral", "Negative".
    confidence : float
        Rule-based confidence in [0.50, 0.95].

    Notes
    -----
    - Deterministic: same input always produces the same output.
    - No ML models, no I/O, no side effects.
    - None and empty/whitespace-only strings return ("Neutral", 0.55).
    """
    if not text or not text.strip():
        return "Neutral", 0.55

    label, conf = _classify(text)

    # BUG FIX: the emoji adjustment must agree with the label's polarity.
    # Previously the signed emoji score was added unconditionally, so a happy
    # emoji *raised* the confidence of a Negative label. Now positive emoji
    # strengthen "Positive" and weaken "Negative" (and vice versa); Neutral
    # is left untouched because emoji alone are too ambiguous a signal.
    emoji_adj = _emoji_score(text)
    if label == "Positive":
        conf += emoji_adj
    elif label == "Negative":
        conf -= emoji_adj

    return label, round(max(0.50, min(conf, 0.95)), 3)
requirements.txt CHANGED
@@ -1,13 +1,8 @@
1
- # Core ML
2
- torch>=2.0.0
3
- transformers>=4.38.0
4
- sentencepiece>=0.1.99
5
-
6
  # Emoji + slang handling
7
  emoji>=2.10.0
8
  deep-translator>=1.11.4
9
 
10
- # Live chat scraping (now uses YouTube Data API v3 β€” no extra package needed)
11
 
12
  # Dashboard
13
  streamlit>=1.35.0
 
 
 
 
 
 
1
  # Emoji + slang handling
2
  emoji>=2.10.0
3
  deep-translator>=1.11.4
4
 
5
+ # Live chat scraping (uses YouTube Data API v3 β€” no extra package needed)
6
 
7
  # Dashboard
8
  streamlit>=1.35.0