# LivePulse — ml/topic_model.py
# Keyword improvements from new CSV analysis — 28/28 tests passing
# (commit 67899d6, author DivYonko; header reconstructed from web-UI paste)
# -*- coding: utf-8 -*-
"""
ml/topic_model.py
=================
Pure keyword/rule-based topic classifier for YouTube live-chat comments.
No ML models are loaded — classification is entirely keyword/regex-based.
Topics
------
Appreciation — praise, thanks, love, encouragement
Question — direct questions and doubts/confusion
Request/Feedback — content requests, faculty requests, feedback, suggestions
Promo — self-promotion, links, "check my channel"
Spam — repeated noise, irrelevant flood, gibberish
MCQ Answer — single letter answers (a/b/c/d/e)
General — anything that doesn't fit the above (fallback)
"""
from __future__ import annotations
import re
# ── Valid topics ───────────────────────────────────────────────────────────────
VALID_TOPICS = {"Appreciation", "Question", "Request/Feedback", "Promo", "Spam", "General", "MCQ Answer"}
# ── Keyword fast-path ──────────────────────────────────────────────────────────
_APPRECIATION_KW = {
"love", "thanks", "thank", "superb", "amazing", "excellent",
"awesome", "wonderful", "brilliant", "fantastic", "best", "perfect",
"mast", "zabardast", "kamaal", "jhakaas", "shandar", "lajawaab", "lajawab",
"waah", "wah", "badhiya", "shukriya", "dhanyawad", "osm", "awsm",
"dhansu", "pyaar", "bindaas", "khush", "happy",
"thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku",
"tysm", "tqsm", "thx",
"informative", "fruitful", "motivating", "lovely",
"bestest", "loved", "nice", "helpful",
"semma", "mass", "solid", "fire", "goated",
}
_QUESTION_KW = {
"kya", "kab", "kb", "kahan", "kaun", "kon", "kitna", "kitne", "konsa", "konsi",
"kaise", "kyun", "kyunki",
"what", "when", "where", "who", "which", "how", "why",
"bata", "batao", "bataye", "tell", "explain",
"samajh", "confused", "confusion", "doubt", "unclear",
"matlab", "matalab", "samjha", "samjhe", "samjhi", "smjh", "smjha",
}
# Content requests β€” asking for new videos, topics, sessions
_RF_CONTENT_REQUEST_KW = {
"banao", "banana", "banaye", "banaiye", "banado",
"karo", "kariye", "karaiye", "kardo",
"lao", "laiye", "layiye",
"start", "shuru", "launch", "resume",
"video", "series", # removed "class" and "session" β€” too generic
"separate", "alag", "akele", "single",
"cover", "include", "add", "topic",
"chahiye", "chahte", "chahta", "chahti",
"request", "requesting",
}
# Academic/resource requests β€” PDFs, notes, downloads
_RF_ACADEMIC_KW = {
"pdf", "notes", "note", "download", "upload",
"drive", "google", "link", "material", "resource",
"timeline", "schedule", "timetable", "syllabus",
"infographic", "slides", "ppt", "handout",
"provide", "share", "send", "dedo", "dedijiye",
"milega", "milegi", "milenge", # "where to find" β€” specific to resource queries
}
# Language requests
_RF_LANGUAGE_KW = {
"hindi", "english", "medium", "language",
"translate", "translation",
}
# Feedback/suggestion keywords
_RF_FEEDBACK_KW = {
"side", "screen", "dikhta", "dikhai",
"correction", "correct", "galat", "wrong", "mistake",
"suggestion", "suggest", "improve", "better",
"feedback", "review", "opinion",
"sorry", "maafi", "apology",
"please", "plz", "pls", "plss", "plzz",
"dijiye", "dijie", "dena", "dedo",
"chahiye", "zaroorat", "need",
}
# Product/app feature requests
_RF_PRODUCT_KW = {
"app", "feature", "option", "button", "setting",
"notification", "reminder", "alert",
"website", "portal", "platform",
}
# Combined RF keyword set
_RF_ALL_KW = (
_RF_CONTENT_REQUEST_KW
| _RF_ACADEMIC_KW
| _RF_LANGUAGE_KW
| _RF_FEEDBACK_KW
| _RF_PRODUCT_KW
)
# Phrases that strongly indicate Request/Feedback (multi-word)
_RF_PHRASES = [
r"\bplease\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
r"\bpls\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
r"\bsir\s+(please|pls|plz)\b",
r"\b(pdf|notes?|material)\s+(upload|provide|share|send|dedo|dijiye)\b",
r"\b(separate|alag|akele)\s+(video|session|class|lecture)\b",
r"\b(hindi|english)\s+(medium|mein|me|pdf|notes?)\b",
r"\b(side|screen)\s+(ho|hojao|hojaye|jaiye)\b",
r"\b(correction|galat|wrong)\s+\w+\b",
r"\brequest\s+(hai|he|h|kar|karna)\b",
r"\b(chahiye|chahte|chahta|chahti)\s+\w+\b",
]
_SPAM_PATTERNS = [
r"^(.)\1{3,}$",
r"^[^a-zA-Z\u0900-\u097F]{0,3}$",
r"https?://\S+",
r"_{4,}",
r"(?:\b[a-z0-9]{6,}\b\s*){6,}", # raised from 3 to 6 β€” avoids catching real sentences
]
_SPAM_KW_SUBSTRINGS = {
"onelink", "zazb", "gatewallah_official", "pwappweb",
"kuldeepsir_pw",
}
_PROMO_KW = {
"subscribe", "channel", "link", "instagram",
"check", "visit", "click", "http", "www", ".com", "telegram",
"https",
}
_MIN_FASTPATH_LEN = 4
# ── Classification ─────────────────────────────────────────────────────────────
def predict_topic(text: str) -> tuple[str, float]:
    """
    Classify a comment into a topic category.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    topic : str
        One of VALID_TOPICS.
    confidence : float
        Rule-based confidence in [0.50, 0.95].

    Notes
    -----
    - Fully keyword/regex-based, no ML models.
    - Rules are checked in priority order: MCQ Answer, Spam, Promo,
      Appreciation, Question, Request/Feedback; anything left falls back
      to "General".
    """
    if not text or not text.strip():
        return "General", 0.50

    t = text.strip().lower()
    # Strip :emoji_shortcode: tokens, then collapse runs of whitespace.
    t_clean = re.sub(r":[a-z_]+:", " ", t).strip()
    t_clean = re.sub(r"\s+", " ", t_clean).strip()

    # ── MCQ Answer ──
    # One pattern covers a single letter ("a"), a repeated letter ("bbb"),
    # and comma/slash-separated lists ("a, c" / "b/d"); it subsumes the
    # former separate single-letter and repeated-letter checks.
    if re.fullmatch(r"([a-e])\1*(\s*[,/]\s*([a-e])\3*)*", t_clean):
        return "MCQ Answer", 0.95

    # ── Spam: known spam handles/links (substring match) ──
    if any(kw in t_clean for kw in _SPAM_KW_SUBSTRINGS):
        return "Spam", 0.90

    # ── Spam/Promo: URL present ──
    # A URL plus any promo word is self-promotion; a bare URL is spam.
    if re.search(r"https?://\S+", t_clean):
        if any(kw in t_clean for kw in _PROMO_KW):
            return "Promo", 0.85
        return "Spam", 0.85

    # ── Spam: repeated chars / very short non-text / gibberish ──
    # All patterns except the last (which is the long-gibberish flood rule).
    # NOTE: an emoji-only comment leaves t_clean empty, which the 0-3
    # non-letter-chars pattern matches, so it lands here as Spam.
    for pat in _SPAM_PATTERNS[:-1]:
        if re.search(pat, t_clean):
            return "Spam", 0.85
    # The gibberish-flood pattern only applies to longer messages.
    if len(t_clean) > 20 and re.search(_SPAM_PATTERNS[-1], t_clean):
        return "Spam", 0.82

    # ── Promo (no URL, but promo words present) ──
    if any(kw in t_clean for kw in _PROMO_KW):
        return "Promo", 0.80

    # Too short for meaningful keyword scoring.
    if len(t_clean) < _MIN_FASTPATH_LEN:
        return "General", 0.55

    words = set(t_clean.split())
    # Question mark is checked on the RAW text (cleaning may strip it).
    has_question_mark = "?" in text
    question_hits = len(words & _QUESTION_KW)
    appreciation_hits = len(words & _APPRECIATION_KW)
    rf_hits = len(words & _RF_ALL_KW)
    # Multi-word Request/Feedback phrase patterns are a strong signal.
    rf_phrase_match = any(re.search(p, t_clean) for p in _RF_PHRASES)

    # ── Appreciation ──
    # A single strong appreciation word suffices, but only when nothing
    # question- or request-like is present.
    if (appreciation_hits >= 1
            and question_hits == 0
            and not has_question_mark
            and rf_hits == 0
            and not rf_phrase_match):
        return "Appreciation", min(0.72 + 0.05 * appreciation_hits, 0.92)

    # ── Question ──
    # Two or more RF keywords (or an RF phrase) outvote a question signal.
    if (has_question_mark or question_hits >= 1) and rf_hits < 2 and not rf_phrase_match:
        return "Question", min(0.75 + 0.04 * question_hits, 0.92)

    # ── Request/Feedback: phrase match ──
    if rf_phrase_match:
        return "Request/Feedback", 0.85

    # ── Request/Feedback: keyword hits ──
    # Short texts need two keyword hits; longer texts need only one.
    min_rf_hits = 1 if len(t_clean) >= 20 else 2
    if rf_hits >= min_rf_hits and question_hits == 0 and not has_question_mark:
        return "Request/Feedback", min(0.72 + 0.04 * rf_hits, 0.90)

    # ── Fallback ──
    return "General", 0.55