# LivePulse — ml/topic_model.py
# Keyword improvements from new CSV analysis — 28/28 tests passing
# (commit 67899d6, author DivYonko; header reconstructed from web-UI paste)
# -*- coding: utf-8 -*-
"""
ml/topic_model.py
=================
Pure keyword/rule-based topic classifier for YouTube live-chat comments.
No ML models are loaded — classification is entirely keyword/regex-based.
Topics
------
Appreciation — praise, thanks, love, encouragement
Question — direct questions and doubts/confusion
Request/Feedback — content requests, faculty requests, feedback, suggestions
Promo — self-promotion, links, "check my channel"
Spam — repeated noise, irrelevant flood, gibberish
MCQ Answer — single letter answers (a/b/c/d/e)
General — anything that doesn't fit the above (fallback)
"""
from __future__ import annotations
import re
# ── Valid topics ───────────────────────────────────────────────────────────────
VALID_TOPICS = {"Appreciation", "Question", "Request/Feedback", "Promo", "Spam", "General", "MCQ Answer"}
# ── Keyword fast-path ──────────────────────────────────────────────────────────
_APPRECIATION_KW = {
"love", "thanks", "thank", "superb", "amazing", "excellent",
"awesome", "wonderful", "brilliant", "fantastic", "best", "perfect",
"mast", "zabardast", "kamaal", "jhakaas", "shandar", "lajawaab", "lajawab",
"waah", "wah", "badhiya", "shukriya", "dhanyawad", "osm", "awsm",
"dhansu", "pyaar", "bindaas", "khush", "happy",
"thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku",
"tysm", "tqsm", "thx",
"informative", "fruitful", "motivating", "lovely",
"bestest", "loved", "nice", "helpful",
"semma", "mass", "solid", "fire", "goated",
}
_QUESTION_KW = {
"kya", "kab", "kb", "kahan", "kaun", "kon", "kitna", "kitne", "konsa", "konsi",
"kaise", "kyun", "kyunki",
"what", "when", "where", "who", "which", "how", "why",
"bata", "batao", "bataye", "tell", "explain",
"samajh", "confused", "confusion", "doubt", "unclear",
"matlab", "matalab", "samjha", "samjhe", "samjhi", "smjh", "smjha",
}
# Content requests β€” asking for new videos, topics, sessions
_RF_CONTENT_REQUEST_KW = {
"banao", "banana", "banaye", "banaiye", "banado",
"karo", "kariye", "karaiye", "kardo",
"lao", "laiye", "layiye",
"start", "shuru", "launch", "resume",
"video", "series", # removed "class" and "session" β€” too generic
"separate", "alag", "akele", "single",
"cover", "include", "add", "topic",
"chahiye", "chahte", "chahta", "chahti",
"request", "requesting",
}
# Academic/resource requests β€” PDFs, notes, downloads
_RF_ACADEMIC_KW = {
"pdf", "notes", "note", "download", "upload",
"drive", "google", "link", "material", "resource",
"timeline", "schedule", "timetable", "syllabus",
"infographic", "slides", "ppt", "handout",
"provide", "share", "send", "dedo", "dedijiye",
"milega", "milegi", "milenge", # "where to find" β€” specific to resource queries
}
# Language requests
_RF_LANGUAGE_KW = {
"hindi", "english", "medium", "language",
"translate", "translation",
}
# Feedback/suggestion keywords
_RF_FEEDBACK_KW = {
"side", "screen", "dikhta", "dikhai",
"correction", "correct", "galat", "wrong", "mistake",
"suggestion", "suggest", "improve", "better",
"feedback", "review", "opinion",
"sorry", "maafi", "apology",
"please", "plz", "pls", "plss", "plzz",
"dijiye", "dijie", "dena", "dedo",
"chahiye", "zaroorat", "need",
}
# Product/app feature requests
_RF_PRODUCT_KW = {
"app", "feature", "option", "button", "setting",
"notification", "reminder", "alert",
"website", "portal", "platform",
}
# Combined RF keyword set
_RF_ALL_KW = (
_RF_CONTENT_REQUEST_KW
| _RF_ACADEMIC_KW
| _RF_LANGUAGE_KW
| _RF_FEEDBACK_KW
| _RF_PRODUCT_KW
)
# Phrases that strongly indicate Request/Feedback (multi-word)
_RF_PHRASES = [
r"\bplease\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
r"\bpls\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
r"\bsir\s+(please|pls|plz)\b",
r"\b(pdf|notes?|material)\s+(upload|provide|share|send|dedo|dijiye)\b",
r"\b(separate|alag|akele)\s+(video|session|class|lecture)\b",
r"\b(hindi|english)\s+(medium|mein|me|pdf|notes?)\b",
r"\b(side|screen)\s+(ho|hojao|hojaye|jaiye)\b",
r"\b(correction|galat|wrong)\s+\w+\b",
r"\brequest\s+(hai|he|h|kar|karna)\b",
r"\b(chahiye|chahte|chahta|chahti)\s+\w+\b",
]
_SPAM_PATTERNS = [
r"^(.)\1{3,}$",
r"^[^a-zA-Z\u0900-\u097F]{0,3}$",
r"https?://\S+",
r"_{4,}",
r"(?:\b[a-z0-9]{6,}\b\s*){6,}", # raised from 3 to 6 β€” avoids catching real sentences
]
_SPAM_KW_SUBSTRINGS = {
"onelink", "zazb", "gatewallah_official", "pwappweb",
"kuldeepsir_pw",
}
_PROMO_KW = {
"subscribe", "channel", "link", "instagram",
"check", "visit", "click", "http", "www", ".com", "telegram",
"https",
}
_MIN_FASTPATH_LEN = 4
# ── Classification ─────────────────────────────────────────────────────────────
def predict_topic(text: str) -> tuple[str, float]:
    """
    Classify a comment into a topic category.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    topic : str
        One of VALID_TOPICS.
    confidence : float
        Rule-based confidence in [0.50, 0.95].

    Notes
    -----
    - Fully keyword/regex-based, no ML models.
    - Rules are checked in priority order: MCQ Answer, Spam, Promo,
      Appreciation, Question, Request/Feedback; anything left falls back
      to "General".
    """
    if not text or not text.strip():
        return "General", 0.50

    t = text.strip().lower()
    # Strip :emoji_shortcode: tokens, then collapse runs of whitespace.
    t_clean = re.sub(r":[a-z_]+:", " ", t).strip()
    t_clean = re.sub(r"\s+", " ", t_clean).strip()

    # ── MCQ Answer ──
    # One pattern covers a single letter ("a"), a repeated letter ("bbb"),
    # and comma/slash-separated lists ("a, c" / "b/d"); it subsumes the
    # former separate single-letter and repeated-letter checks.
    if re.fullmatch(r"([a-e])\1*(\s*[,/]\s*([a-e])\3*)*", t_clean):
        return "MCQ Answer", 0.95

    # ── Spam: known spam handles/links (substring match) ──
    if any(kw in t_clean for kw in _SPAM_KW_SUBSTRINGS):
        return "Spam", 0.90

    # ── Spam/Promo: URL present ──
    # A URL plus any promo word is self-promotion; a bare URL is spam.
    if re.search(r"https?://\S+", t_clean):
        if any(kw in t_clean for kw in _PROMO_KW):
            return "Promo", 0.85
        return "Spam", 0.85

    # ── Spam: repeated chars / very short non-text / gibberish ──
    # All patterns except the last (which is the long-gibberish flood rule).
    # NOTE: an emoji-only comment leaves t_clean empty, which the 0-3
    # non-letter-chars pattern matches, so it lands here as Spam.
    for pat in _SPAM_PATTERNS[:-1]:
        if re.search(pat, t_clean):
            return "Spam", 0.85
    # The gibberish-flood pattern only applies to longer messages.
    if len(t_clean) > 20 and re.search(_SPAM_PATTERNS[-1], t_clean):
        return "Spam", 0.82

    # ── Promo (no URL, but promo words present) ──
    if any(kw in t_clean for kw in _PROMO_KW):
        return "Promo", 0.80

    # Too short for meaningful keyword scoring.
    if len(t_clean) < _MIN_FASTPATH_LEN:
        return "General", 0.55

    words = set(t_clean.split())
    # Question mark is checked on the RAW text (cleaning may strip it).
    has_question_mark = "?" in text
    question_hits = len(words & _QUESTION_KW)
    appreciation_hits = len(words & _APPRECIATION_KW)
    rf_hits = len(words & _RF_ALL_KW)
    # Multi-word Request/Feedback phrase patterns are a strong signal.
    rf_phrase_match = any(re.search(p, t_clean) for p in _RF_PHRASES)

    # ── Appreciation ──
    # A single strong appreciation word suffices, but only when nothing
    # question- or request-like is present.
    if (appreciation_hits >= 1
            and question_hits == 0
            and not has_question_mark
            and rf_hits == 0
            and not rf_phrase_match):
        return "Appreciation", min(0.72 + 0.05 * appreciation_hits, 0.92)

    # ── Question ──
    # Two or more RF keywords (or an RF phrase) outvote a question signal.
    if (has_question_mark or question_hits >= 1) and rf_hits < 2 and not rf_phrase_match:
        return "Question", min(0.75 + 0.04 * question_hits, 0.92)

    # ── Request/Feedback: phrase match ──
    if rf_phrase_match:
        return "Request/Feedback", 0.85

    # ── Request/Feedback: keyword hits ──
    # Short texts need two keyword hits; longer texts need only one.
    min_rf_hits = 1 if len(t_clean) >= 20 else 2
    if rf_hits >= min_rf_hits and question_hits == 0 and not has_question_mark:
        return "Request/Feedback", min(0.72 + 0.04 * rf_hits, 0.90)

    # ── Fallback ──
    return "General", 0.55