DivYonko committed on
Commit
b9d31ba
·
1 Parent(s): 1baaf30

Replace ML ensemble with pure keyword sentiment engine

Browse files
Files changed (4) hide show
  1. Dockerfile +0 -3
  2. README.md +7 -5
  3. ml/sentiment_model.py +373 -264
  4. requirements.txt +1 -6
Dockerfile CHANGED
@@ -13,9 +13,6 @@ RUN pip install --no-cache-dir -r requirements.txt
13
 
14
  COPY . .
15
 
16
- # Suppress Streamlit's file watcher scanning transformers (harmless but noisy)
17
- ENV STREAMLIT_SERVER_FILE_WATCHER_TYPE=none
18
-
19
  EXPOSE 7860
20
 
21
  HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
 
13
 
14
  COPY . .
15
 
 
 
 
16
  EXPOSE 7860
17
 
18
  HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
README.md CHANGED
@@ -15,12 +15,14 @@ Real-time Hinglish sentiment and topic analysis for YouTube live streams.
15
 
16
  ## Features
17
 
18
- - Real-time chat scraping via pytchat
19
- - Sentiment classification (Positive / Neutral / Negative) using a 3-model ensemble
20
- - Fine-tuned MuRIL (Hinglish-aware)
21
- - XLM-RoBERTa (multilingual Twitter model)
22
- - Multilingual sentiment model
 
23
  - Topic classification (Appreciation / Question / Promo / Spam / MCQ Answer / General)
 
24
  - Interactive Streamlit dashboard with live auto-refresh
25
  - Start/stop scraper directly from the UI
26
  - Multi-stream comparison (up to 5 streams)
 
15
 
16
  ## Features
17
 
18
+ - Real-time chat scraping via YouTube Data API v3
19
+ - Sentiment classification (Positive / Neutral / Negative) using a pure keyword engine
20
+ - Expanded Hinglish + English + regional slang keyword sets
21
+ - Negation handling ("nahi accha" → Negative)
22
+ - Intensifier boost ("bahut accha" → higher confidence)
23
+ - Emoji sentiment scoring
24
  - Topic classification (Appreciation / Question / Promo / Spam / MCQ Answer / General)
25
+ - Action type classification (28 fine-grained categories, fully keyword-based)
26
  - Interactive Streamlit dashboard with live auto-refresh
27
  - Start/stop scraper directly from the UI
28
  - Multi-stream comparison (up to 5 streams)
ml/sentiment_model.py CHANGED
@@ -1,305 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import re
4
- import threading
5
- from functools import lru_cache
6
 
7
  import emoji
8
- import torch
9
- import torch.nn.functional as F
10
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
11
-
12
-
13
- # ── Model paths ────────────────────────────────────────────────────────────────
14
- MURIL_MODEL = "./new_trained_data/muril-sentimix"
15
- XLMR_MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
16
- MULTI_MODEL = "tabularisai/multilingual-sentiment-analysis"
17
-
18
- LABELS = ["Negative", "Neutral", "Positive"]
19
-
20
- # Weights
21
- MURIL_WEIGHT = 0.40
22
- XLMR_WEIGHT = 0.35
23
- MULTI_WEIGHT = 0.25
24
-
25
-
26
- # ── Lazy loading ───────────────────────────────────────────────────────────────
27
- _lock = threading.Lock()
28
-
29
- _muril_tokenizer = _muril_model = None
30
- _xlmr_tokenizer = _xlmr_model = None
31
- _multi_tokenizer = _multi_model = None
32
- _models_loaded = False
33
- _load_error: Exception | None = None
34
-
35
-
36
- def _load_models():
37
- global _muril_tokenizer, _muril_model
38
- global _xlmr_tokenizer, _xlmr_model
39
- global _multi_tokenizer, _multi_model
40
- global _models_loaded, _load_error
41
-
42
- if _models_loaded:
43
- return
44
-
45
- with _lock:
46
- if _models_loaded:
47
- return
48
-
49
- print("[sentiment] Loading models...")
50
- try:
51
- _muril_tokenizer = AutoTokenizer.from_pretrained(MURIL_MODEL)
52
- _muril_model = AutoModelForSequenceClassification.from_pretrained(MURIL_MODEL)
53
- print(f"[sentiment] MuRIL loaded β€” id2label: {_muril_model.config.id2label}")
54
-
55
- _xlmr_tokenizer = AutoTokenizer.from_pretrained(XLMR_MODEL)
56
- _xlmr_model = AutoModelForSequenceClassification.from_pretrained(XLMR_MODEL)
57
- print(f"[sentiment] XLM-R loaded β€” id2label: {_xlmr_model.config.id2label}")
58
-
59
- _multi_tokenizer = AutoTokenizer.from_pretrained(MULTI_MODEL)
60
- _multi_model = AutoModelForSequenceClassification.from_pretrained(MULTI_MODEL)
61
- print(f"[sentiment] Multilingual loaded β€” id2label: {_multi_model.config.id2label}")
62
-
63
- _muril_model.eval()
64
- _xlmr_model.eval()
65
- _multi_model.eval()
66
-
67
- if torch.cuda.is_available():
68
- _muril_model.to("cuda")
69
- _xlmr_model.to("cuda")
70
- _multi_model.to("cuda")
71
-
72
- _models_loaded = True
73
- print("[sentiment] All models ready βœ“")
74
-
75
- except Exception as exc:
76
- _load_error = exc
77
- print(f"[sentiment] ERROR loading models: {exc}")
78
- raise
79
-
80
-
81
- def _device():
82
- if not _models_loaded:
83
- _load_models()
84
- return next(_muril_model.parameters()).device
85
-
86
-
87
- # ── Text normalization ─────────────────────────────────────────────────────────
88
- def _normalize_repeated_chars(text: str) -> str:
89
- return re.sub(r"(.)\1{2,}", r"\1\1", text)
90
 
91
 
92
  # ── Emoji scoring ──────────────────────────────────────────────────────────────
93
- _POS_KW = {"love", "fire", "happy", "laugh", "win", "cool", "best", "heart", "smile", "star", "clap", "pray", "sparkle", "sun", "rainbow"}
94
- _NEG_KW = {"angry", "sad", "cry", "worst", "bad", "hate", "skull", "vomit", "rage", "broken", "disappointed"}
 
 
 
 
 
 
 
 
 
 
 
95
 
96
 
97
- def _emoji_score(text: str):
98
- score = 0
 
99
  for ch in text:
100
  if emoji.is_emoji(ch):
101
  name = emoji.demojize(ch)
102
- if any(k in name for k in _POS_KW):
103
- score += 0.2
104
- elif any(k in name for k in _NEG_KW):
105
- score -= 0.2
106
- return score
107
-
108
-
109
- # ── Hinglish slang ─────────────────────────────────────────────────────────────
110
- _SLANG = {
111
- # Positive
112
- "mast": "excellent",
113
- "op": "excellent",
114
- "lit": "amazing",
115
- "sahi": "correct good",
116
- "jhakaas": "awesome",
117
- "kadak": "strong good",
118
- "zabardast": "fantastic",
119
- "kamaal": "amazing",
120
- "bindaas": "great",
121
- "ekdum": "absolutely",
122
- "shandar": "splendid",
123
- "lajawaab": "outstanding",
124
- "waah": "wow great",
125
- "wah": "wow great",
126
- "superb": "excellent",
127
- "osm": "awesome",
128
- "awsm": "awesome",
129
- "gr8": "great",
130
- "lajawab": "outstanding",
131
- "dhansu": "awesome",
132
- "fatafat": "excellent quick",
133
- "mazza": "fun enjoyable",
134
- "maja": "fun enjoyable",
135
- "acha": "good",
136
- "accha": "good",
137
- "badhiya": "very good",
138
- "shukriya": "thank you grateful",
139
- "dhanyawad": "thank you grateful",
140
- "love": "love positive",
141
- "pyaar": "love positive",
142
-
143
- # Negative
144
- "bakwas": "nonsense bad",
145
- "faltu": "useless bad",
146
- "bekar": "useless bad",
147
- "ghatiya": "terrible bad",
148
- "wahiyat": "awful bad",
149
- "bura": "bad negative",
150
- "kharab": "bad negative",
151
- "boring": "boring negative",
152
- "bekaar": "useless bad",
153
- "chutiya": "stupid offensive",
154
- "ullu": "fool negative",
155
- "pagal": "crazy negative",
156
- "besharam": "shameless negative",
157
- "nafrat": "hate negative",
158
- "gussa": "angry negative",
159
- "naraaz": "angry upset",
160
- "dukh": "sad negative",
161
- "takleef": "pain negative",
162
- "mushkil": "difficult negative",
163
- "problem": "problem negative",
164
  }
165
 
 
 
166
 
167
- def _preprocess(text: str) -> str:
168
- text = _normalize_repeated_chars(text)
169
 
170
- text = emoji.replace_emoji(
171
- text,
172
- replace=lambda ch, data_dict: f" {emoji.demojize(ch).strip(':')} " if emoji.is_emoji(ch) else ch
173
- )
174
 
175
- text = text.lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
- words = []
178
- for w in text.split():
179
- if w in _SLANG:
180
- words.append(_SLANG[w])
181
- else:
182
- words.append(w)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
- text = " ".join(words)
185
- text = re.sub(r"[^\w\s]", "", text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
- return text.strip()
188
 
 
189
 
190
- # ── Fast path ──────────────────────────────────────────────────────────────────
191
- _POS_SLANG = {"mast", "op", "lit", "sahi", "jhakaas", "kadak", "zabardast", "kamaal",
192
- "bindaas", "shandar", "lajawaab", "lajawab", "waah", "wah", "superb",
193
- "osm", "awsm", "dhansu", "badhiya", "maja", "mazza", "acha", "accha",
194
- "ekdum", "love", "pyaar", "shukriya", "dhanyawad"}
195
- _NEG_SLANG = {"bakwas", "faltu", "bekar", "bekaar", "ghatiya", "wahiyat", "bura",
196
- "kharab", "boring", "ullu", "nafrat", "gussa", "naraaz"}
 
 
 
197
 
198
 
199
- def _fast_path(text: str):
200
- stripped = text.strip().lower()
201
 
202
- if len(stripped) <= 2:
203
- return "Neutral", 0.6
 
204
 
205
- words = set(stripped.split())
 
 
206
 
207
- pos_hits = len(words & _POS_SLANG)
208
- neg_hits = len(words & _NEG_SLANG)
209
 
210
- if pos_hits > neg_hits and pos_hits >= 1:
211
- return "Positive", min(0.75 + 0.05 * pos_hits, 0.92)
212
- if neg_hits > pos_hits and neg_hits >= 1:
213
- return "Negative", min(0.75 + 0.05 * neg_hits, 0.92)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
- return None
 
 
 
216
 
 
 
 
217
 
218
- # ── Model inference ────────────────────────────────────────────────────────────
219
- # Canonical label order used throughout the ensemble
220
- _CANONICAL = ["Negative", "Neutral", "Positive"]
221
 
222
- # Normalise a label string so casing/spacing differences don't matter β€” used in _align_probs
223
 
 
224
 
225
- def _align_probs(probs: torch.Tensor, id2label: dict) -> torch.Tensor:
226
  """
227
- Reorder/collapse `probs` to always produce [Negative, Neutral, Positive].
228
- Handles both 3-class and 5-class (Very Negative/Negative/Neutral/Positive/Very Positive) models.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  """
230
- # 5-class: collapse Very Negative→Negative, Very Positive→Positive
231
- _5CLASS_MAP = {
232
- "very negative": 0, "negative": 0, "neg": 0,
233
- "neutral": 1, "neu": 1,
234
- "positive": 2, "pos": 2, "very positive": 2,
235
- }
236
- _3CLASS_MAP = {
237
- "negative": 0, "neg": 0,
238
- "neutral": 1, "neu": 1,
239
- "positive": 2, "pos": 2,
240
- }
241
- label_map = _5CLASS_MAP if len(id2label) == 5 else _3CLASS_MAP
242
- try:
243
- aligned = torch.zeros(3, device=probs.device)
244
- for native_idx, label in id2label.items():
245
- canonical_idx = label_map[label.lower()]
246
- aligned[canonical_idx] += probs[native_idx]
247
- return aligned
248
- except (KeyError, IndexError):
249
- print(f"[sentiment] WARNING: could not align labels {id2label}, using raw order")
250
- return probs[:3]
251
-
252
-
253
- def _infer_aligned(tokenizer, model, text: str) -> torch.Tensor:
254
- """Run inference and return probs aligned to [Negative, Neutral, Positive]."""
255
- device = _device()
256
-
257
- inputs = tokenizer(
258
- text,
259
- return_tensors="pt",
260
- truncation=True,
261
- max_length=128,
262
- padding=True,
263
- ).to(device)
264
-
265
- with torch.no_grad():
266
- logits = model(**inputs).logits
267
-
268
- probs = F.softmax(logits, dim=-1).squeeze()
269
- return _align_probs(probs, model.config.id2label)
270
-
271
-
272
- # ── Ensemble ───────────────────────────────────────────────────────────────────
273
- @lru_cache(maxsize=512)
274
- def _ensemble(text):
275
- _load_models()
276
-
277
- p_muril = _infer_aligned(_muril_tokenizer, _muril_model, text)
278
- p_xlmr = _infer_aligned(_xlmr_tokenizer, _xlmr_model, text)
279
- p_multi = _infer_aligned(_multi_tokenizer, _multi_model, text)
280
-
281
- probs = MURIL_WEIGHT * p_muril + XLMR_WEIGHT * p_xlmr + MULTI_WEIGHT * p_multi
282
-
283
- conf, idx = torch.max(probs, dim=0)
284
-
285
- return _CANONICAL[idx.item()], conf.item()
286
-
287
-
288
- # ── Public API ─────────────────────────────────────────────────────────────────
289
- def predict_sentiment(text: str):
290
-
291
- fast = _fast_path(text)
292
- if fast:
293
- return fast
294
-
295
- clean = _preprocess(text)
296
-
297
- if not clean:
298
  return "Neutral", 0.55
299
 
300
- label, conf = _ensemble(clean)
301
 
302
- boost = _emoji_score(text)
303
- conf = max(0, min(conf + boost, 1))
 
304
 
305
- return label, round(conf, 2)
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ ml/sentiment_model.py
4
+ =====================
5
+ Pure keyword/rule-based sentiment classifier for YouTube live-chat comments.
6
+ No ML models are loaded β€” classification is entirely keyword/regex-based.
7
+
8
+ Approach
9
+ --------
10
+ 1. Emoji scoring β€” positive/negative emoji characters boost confidence
11
+ 2. Negation check — "nahi accha" flips Positive → Negative
12
+ 3. Intensifier boost β€” "bahut accha" raises confidence
13
+ 4. Keyword matching β€” expanded Hinglish + English + regional + typo variants
14
+ 5. Fallback β€” Neutral at 0.55 if nothing fires
15
+
16
+ Public API
17
+ ----------
18
+ predict_sentiment(text: str) -> tuple[str, float]
19
+ Returns (label, confidence) where label ∈ {"Positive", "Neutral", "Negative"}
20
+ and confidence ∈ [0.50, 0.95].
21
+ """
22
+
23
  from __future__ import annotations
24
 
25
  import re
 
 
26
 
27
  import emoji
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
 
30
# ── Emoji scoring ──────────────────────────────────────────────────────────────
# Keyword fragments matched against demojized emoji names (e.g. ":red_heart:").
_EMOJI_POS_KW = {
    "love", "fire", "happy", "laugh", "win", "cool", "best", "heart",
    "smile", "star", "clap", "pray", "sparkle", "sun", "rainbow",
    "thumbs_up", "raised_hands", "partying", "grinning", "beaming",
    "smiling", "joy", "hundred", "muscle", "trophy", "crown",
}
_EMOJI_NEG_KW = {
    "angry", "sad", "cry", "worst", "bad", "hate", "skull", "vomit",
    "rage", "broken", "disappointed", "thumbs_down", "weary", "tired",
    "loudly_crying", "fearful", "anguished", "confounded", "persevere",
    "unamused", "expressionless", "nauseated", "sneezing",
}


def _emoji_score(text: str) -> float:
    """Score the emoji sentiment of *text*; result clamped to [-0.4, 0.4].

    Each positive-looking emoji adds 0.15 and each negative-looking one
    subtracts 0.15; non-emoji characters are ignored. Keyword matching is
    substring-based on the demojized name, so e.g. "loudly_crying_face"
    matches both "loudly_crying" and "cry".
    """
    total = 0.0
    for char in text:
        if not emoji.is_emoji(char):
            continue
        demojized = emoji.demojize(char)
        if any(kw in demojized for kw in _EMOJI_POS_KW):
            total += 0.15
        elif any(kw in demojized for kw in _EMOJI_NEG_KW):
            total -= 0.15
    return min(0.4, max(total, -0.4))
57
+
58
+
59
# ── Negation words ─────────────────────────────────────────────────────────────
# A negation token near a sentiment keyword flips that keyword's polarity.
_NEGATION_WORDS: set[str] = {
    # Hindi / Hinglish
    "nahi", "nhi", "nahin", "na", "mat", "naa", "nope",
    "bilkul nahi", "kabhi nahi", "kabhi nhi",
    # English
    "not", "no", "never", "neither", "nor", "without",
    "don't", "dont", "doesn't", "doesnt", "didn't", "didnt",
    "can't", "cant", "won't", "wont", "isn't", "isnt",
    "wasn't", "wasnt", "aren't", "arent", "weren't", "werent",
    "hardly", "barely", "scarcely",
}

# How many tokens BEFORE a sentiment word are scanned for a negation.
_NEGATION_WINDOW = 3


def _is_negated(word_list: list[str], sentiment_idx: int) -> bool:
    """True when a negation token sits near word_list[sentiment_idx].

    Covers both orders:
      - pre-negation:  "nahi accha tha"  (negation within _NEGATION_WINDOW
        words before the sentiment word)
      - post-negation: "boring nahi tha" (negation up to 2 words after it)

    NOTE(review): multi-word entries such as "bilkul nahi" can never match
    a single token here; "nahi" alone still fires, so the net effect is
    the same.
    """
    lo = max(0, sentiment_idx - _NEGATION_WINDOW)
    preceding = word_list[lo:sentiment_idx]
    following = word_list[sentiment_idx + 1: sentiment_idx + 3]
    if not _NEGATION_WORDS.isdisjoint(preceding):
        return True
    return not _NEGATION_WORDS.isdisjoint(following)
92
+
93
+
94
# ── Intensifier words ──────────────────────────────────────────────────────────
# Words that amplify confidence when they appear just before a sentiment word.
_INTENSIFIERS: dict[str, float] = {
    # Hindi / Hinglish
    "bahut": 0.10,    # very
    "bohot": 0.10,
    "bht": 0.08,
    "ekdum": 0.12,    # absolutely
    "bilkul": 0.10,   # completely
    "itna": 0.08,     # this much
    "kitna": 0.06,
    "zyada": 0.08,    # more / too much
    "bohat": 0.10,
    "atyant": 0.10,   # extremely (formal Hindi)
    "sampurn": 0.08,  # completely
    # English
    "very": 0.08,
    "too": 0.08,
    "so": 0.06,
    "super": 0.10,
    "ultra": 0.10,
    "extremely": 0.12,
    "absolutely": 0.12,
    "totally": 0.10,
    "really": 0.08,
    "truly": 0.08,
    "highly": 0.08,
    "deeply": 0.08,
    "insanely": 0.10,
    "incredibly": 0.10,
    "genuinely": 0.08,
}

# How many tokens BEFORE a sentiment word are scanned for intensifiers.
_INTENSIFIER_WINDOW = 2


def _intensifier_boost(word_list: list[str], sentiment_idx: int) -> float:
    """Sum the intensifier boosts in the window just before
    word_list[sentiment_idx], capped at 0.15 overall."""
    lo = max(0, sentiment_idx - _INTENSIFIER_WINDOW)
    total = 0.0
    for token in word_list[lo:sentiment_idx]:
        total += _INTENSIFIERS.get(token, 0.0)
    # Cap so stacked intensifiers cannot dominate the confidence formula.
    return total if total < 0.15 else 0.15
136
+
137
+
138
# ── Positive keyword set ───────────────────────────────────────────────────────
# Entries are compared against tokens produced by _normalise(), which collapses
# runs of 3+ identical characters down to 2 ("amazinggg" -> "amazingg").
# Keywords must therefore exist in that post-normalisation form; see the
# "Post-normalisation variants" group added below.
# NOTE(review): multi-word entries ("vera level", "full josh", "paisa vasool")
# only fire if the caller performs phrase-level (n-gram) matching.
_POS_WORDS: set[str] = {
    # ── Core Hinglish slang ──
    "mast", "jhakaas", "kadak", "zabardast", "kamaal", "bindaas",
    "shandar", "lajawaab", "lajawab", "lajaab", "waah", "wah",
    "dhansu", "badhiya", "badiya", "maja", "mazza", "maza",
    "acha", "accha", "achha", "acha", "sahi", "sach",
    "shukriya", "dhanyawad", "dhanyavaad", "meherbani", "shukran",
    "pyaar", "pyar", "khushi", "khush",
    "fatafat", "jaldi",

    # ── Typo / abbreviation variants ──
    "osm", "awsm", "awsom", "awsome", "amzing", "amazng",
    "gr8", "grt", "gr9", "fab", "fabbb",
    "superrr", "amazinggg", "besttt", "niceee", "gooddd",
    "thku", "thnku", "thnkuu", "thnkyou", "thanku", "thankyou",
    "thnk", "thnq", "thnks", "thnx", "tysm", "tqsm", "thx", "ty",
    "ty", "tyvm", "tyvmm",

    # ── Post-normalisation variants ──
    # BUG FIX: the triple-letter spellings above ("amazinggg", "besttt",
    # "niceee", "gooddd", "superrr", "fabbb") could never match because
    # _normalise() reduces them to double letters before lookup.
    "amazingg", "superr", "bestt", "nicee", "goodd", "fabb",

    # ── English positive ──
    "amazing", "awesome", "excellent", "wonderful", "fantastic",
    "brilliant", "outstanding", "exceptional", "magnificent",
    "superb", "perfect", "great", "good", "nice", "beautiful",
    "lovely", "loved", "love", "best", "better",
    "helpful", "useful", "informative", "fruitful", "motivating",
    "motivational", "inspiring", "inspired", "insightful",
    "clear", "clarity", "simple", "easy", "smooth",
    "thankful", "grateful", "blessed", "proud",
    "happy", "glad", "pleased", "satisfied", "content",
    "enjoy", "enjoyed", "enjoying", "fun", "interesting",
    "impressive", "impressed", "incredible", "unbelievable",
    "top", "topnotch", "firstclass", "worldclass",
    "recommend", "recommended", "worth", "worthy",
    "thanks", "thank", "appreciate", "appreciated",
    "respect", "salute", "legend", "goat", "king", "queen",
    "bestest", "bestttttt", "much", "op", "lit",

    # ── Regional / South Indian Hinglish ──
    "semma",         # Tamil slang for awesome
    "mass",          # Tamil/Telugu slang for impressive
    "vera level",    # Tamil slang for next level
    "sema",          # variant of semma
    "bindass",       # variant of bindaas
    "dum",           # strength/power (positive context)
    "dhamakedaar",   # explosive/amazing
    "dhamaka",       # blast/amazing
    "toofan",        # storm (used positively)
    "jalwa",         # aura/presence (positive)
    "josh",          # enthusiasm/energy
    "full josh",
    "paisa vasool",  # worth the money
    "makkhan",       # butter smooth (positive)
    "solid",         # solid/strong (positive)
    "tight",         # tight/solid (positive slang)
    "fire",          # fire (positive slang)
    "goated",        # GOAT-ed (positive slang)
    "based",         # based (positive slang)
    "valid",         # valid (positive slang)
    "clean",         # clean explanation

    # ── Gratitude phrases (single tokens after normalization) ──
    "shukriyaa", "shukriyaaa", "dhanyawaad", "dhanyawaaad",
    "abhar",   # gratitude (formal Hindi)
    "aabhar",

    # ── Common live chat positives ──
    "woww", "wowww", "woah", "whoa", "yay", "yayy",
    "haha", "hahaha", "lol", "lmao",  # laughter = positive
    "clap", "claps", "bravo", "chappal",  # chappal = clap in some contexts
    "heart", "hearts",
    "100", "1000",  # "100%" positive
}
210
 
211
# ── Negative keyword set ───────────────────────────────────────────────────────
# Entries are compared against tokens produced by _normalise(), which collapses
# runs of 3+ identical characters down to 2 ("worstttt" -> "worstt").
# Keywords must therefore exist in that post-normalisation form; see the
# "Post-normalisation variants" group added below.
# NOTE(review): multi-word entries ("not good", "waste of time") only fire if
# the caller performs phrase-level (n-gram) matching.
_NEG_WORDS: set[str] = {
    # ── Core Hinglish slang ──
    "bakwas", "bakwaas", "bakwaaas",
    "faltu", "faltuu",
    "bekar", "bekaar", "bekaaar",
    "ghatiya", "ghatiiya",
    "wahiyat", "wahiyaat",
    "bura", "buraa",
    "kharab", "kharaaab",
    "boring", "borring", "booring",
    "ullu", "pagal", "paagal",
    "besharam", "besharaam",
    "nafrat", "gussa", "naraaz",
    "dukh", "takleef", "mushkil",
    "uruttu", "battamizi", "battameezi",
    "natak", "nautanki",
    "dhoka", "dhokha", "jhooth", "jhoota",
    "dikhawa", "dikhaawa",
    "beizzati", "beizzatii", "bezaati",
    "sharam", "sharaam",
    "galat", "galt",
    "jhanjhat", "jhamela",
    "tang", "pareshan", "pareshaan",
    "nirasha", "niraash",   # disappointment
    "thaka", "thakaan",     # tired/exhausted
    "dard", "peeda",        # pain
    "rona", "rota", "roti", # crying
    "cheat", "cheating",
    "fraud", "fraudiya",
    "loot", "loota", "looting",

    # ── English negative ──
    "useless", "unfair", "disappointing", "disappointed",
    "foolish", "stupid", "idiot", "idiotic",
    "terrible", "horrible", "awful", "dreadful",
    "worst", "worse", "bad", "poor",
    "waste", "wasted", "pathetic",
    "annoying", "annoyed", "irritating", "irritated",
    "frustrating", "frustrated", "frustration",
    "confusing", "confused", "confusion",
    "misleading", "clickbait",
    "fake", "scam", "spam",
    "hate", "hated", "hating",
    "angry", "anger", "rage",
    "sad", "sadness", "unhappy", "upset",
    "wrong", "incorrect", "error", "mistake",
    "problem", "issue", "bug", "broken",
    "slow", "lagging", "lag", "buffering",
    "crash", "crashed", "crashing",
    "fail", "failed", "failure",
    "ignore", "ignored", "ignoring",
    "rude", "disrespect", "disrespectful",
    "unfair", "biased", "bias",
    "overpriced", "expensive", "costly",
    "wtf", "wth", "omg",  # context-dependent but often negative in complaints
    "curse", "abusive",
    "liar", "lie", "lies",
    "cheat", "cheater",
    "regret", "regretted", "regrets",
    "never", "worst",

    # ── Typo / abbreviation variants ──
    "bakwaaas", "bekarrr", "borinnng",
    "worstttt", "terribleee",

    # ── Post-normalisation variants ──
    # BUG FIX: "kharaaab", "bekarrr", "borinnng", "worstttt" and
    # "terribleee" above could never match because _normalise() reduces
    # 3+ repeated characters to 2 before lookup.
    "kharaab", "bekarr", "borinng", "worstt", "terriblee",

    # ── Regional / South Indian Hinglish ──
    "kabaad",    # junk/trash
    "raddi",     # waste/junk
    "kachra",    # garbage
    "bekar",     # useless (already above)
    "nikamma",   # good-for-nothing
    "nalayak",   # incompetent
    "kamina",    # scoundrel
    "harami",    # offensive negative
    "bewakoof",  # fool
    "gadha",     # donkey (fool)
    "buddhu",    # fool
    "duffer",    # dull/stupid
    "flop",      # flop/failure
    "disaster",  # disaster
    "pathetic",  # pathetic (already above)
    "cringe",    # cringe
    "cap",       # cap = lie (slang)
    "mid",       # mid = mediocre/bad (slang)
    "trash",     # trash
    "garbage",   # garbage
    "dogwater",  # very bad (gaming slang)
    "lowkey bad",
    "not good",
    "not helpful",
    "not worth",
    "time waste",
    "time wasted",
    "waste of time",
}
307
 
 
308
 
309
# ── Text normalisation ─────────────────────────────────────────────────────────

def _normalise(text: str) -> str:
    """Normalise raw chat text for keyword lookup.

    Steps, in order:
      1. drop demojized emoji codes such as ":fire:" / ":thumbs_up:"
      2. lowercase
      3. collapse runs of 3+ identical characters to 2
         ("amazinggg" -> "amazingg", keeping doubles so "woww" still matches)
      4. collapse all whitespace runs to single spaces and strip the ends
      5. truncate to 512 characters
    """
    cleaned = re.sub(r":[a-z_]+:", " ", text).lower()
    cleaned = re.sub(r"(.)\1{2,}", r"\1\1", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return cleaned[:512]
321
 
322
 
323
# ── Core classification ────────────────────────────────────────────────────────

def _classify(text: str) -> tuple[str, float]:
    """
    Classify text via keyword matching with negation and intensifier handling.

    Parameters
    ----------
    text : str
        Raw comment text; normalised internally via _normalise().

    Returns
    -------
    (label, base_confidence) before any emoji adjustment, where label is
    one of "Positive", "Neutral", "Negative".
    """
    t = _normalise(text)

    # Messages of 1–2 characters carry no reliable signal.
    if len(t) <= 2:
        return "Neutral", 0.55

    word_list = t.split()

    pos_score = 0.0
    neg_score = 0.0
    pos_boost = 0.0
    neg_boost = 0.0

    def _score_hit(idx: int, positive: bool) -> None:
        # Tally one keyword hit anchored at word index `idx`;
        # a nearby negation word flips its polarity.
        nonlocal pos_score, neg_score, pos_boost, neg_boost
        if _is_negated(word_list, idx):
            positive = not positive
        boost = _intensifier_boost(word_list, idx)
        if positive:
            pos_score += 1.0
            pos_boost = max(pos_boost, boost)
        else:
            neg_score += 1.0
            neg_boost = max(neg_boost, boost)

    # Single-token matches.
    for idx, word in enumerate(word_list):
        if word in _POS_WORDS:
            _score_hit(idx, True)
        elif word in _NEG_WORDS:
            _score_hit(idx, False)

    # BUG FIX: the keyword sets contain multi-word phrases ("not good",
    # "vera level", "waste of time") that single-token matching could never
    # fire. Check 2- and 3-grams against the same sets; each phrase is
    # anchored at its first word for negation/intensifier lookups.
    n_words = len(word_list)
    for size in (2, 3):
        for idx in range(n_words - size + 1):
            phrase = " ".join(word_list[idx:idx + size])
            if phrase in _POS_WORDS:
                _score_hit(idx, True)
            elif phrase in _NEG_WORDS:
                _score_hit(idx, False)

    # No keyword fired at all -> Neutral fallback.
    if pos_score == 0 and neg_score == 0:
        return "Neutral", 0.55

    # Winner takes the label; confidence grows with hit count and the best
    # intensifier boost seen, capped at 0.92.
    if pos_score > neg_score:
        return "Positive", round(min(0.72 + 0.05 * pos_score + pos_boost, 0.92), 3)

    if neg_score > pos_score:
        return "Negative", round(min(0.72 + 0.05 * neg_score + neg_boost, 0.92), 3)

    # Tie -> Neutral with slightly raised confidence.
    return "Neutral", 0.58
 
379
 
 
380
 
381
# ── Public API ─────────────────────────────────────────────────────────────────

def predict_sentiment(text: str) -> tuple[str, float]:
    """
    Classify a comment's sentiment.

    Parameters
    ----------
    text : str
        Raw comment text (may be Hinglish, emoji-containing, mixed script, or None).

    Returns
    -------
    label : str
        One of "Positive", "Neutral", "Negative".
    confidence : float
        Rule-based confidence in [0.50, 0.95].

    Notes
    -----
    - Deterministic: same input always produces the same output.
    - No ML models, no I/O, no side effects.
    - None and empty/whitespace-only strings return ("Neutral", 0.55).
    """
    if not text or not text.strip():
        return "Neutral", 0.55

    label, conf = _classify(text)

    # BUG FIX: the emoji adjustment must agree with the label's polarity.
    # Previously the signed emoji score was added unconditionally, so a happy
    # emoji *raised* the confidence of a Negative label. Now positive emoji
    # strengthen "Positive" and weaken "Negative" (and vice versa); Neutral
    # is left untouched because emoji alone are too ambiguous a signal.
    emoji_adj = _emoji_score(text)
    if label == "Positive":
        conf += emoji_adj
    elif label == "Negative":
        conf -= emoji_adj

    return label, round(max(0.50, min(conf, 0.95)), 3)
requirements.txt CHANGED
@@ -1,13 +1,8 @@
1
- # Core ML
2
- torch>=2.0.0
3
- transformers>=4.38.0
4
- sentencepiece>=0.1.99
5
-
6
  # Emoji + slang handling
7
  emoji>=2.10.0
8
  deep-translator>=1.11.4
9
 
10
- # Live chat scraping (now uses YouTube Data API v3 β€” no extra package needed)
11
 
12
  # Dashboard
13
  streamlit>=1.35.0
 
 
 
 
 
 
1
  # Emoji + slang handling
2
  emoji>=2.10.0
3
  deep-translator>=1.11.4
4
 
5
+ # Live chat scraping (uses YouTube Data API v3 β€” no extra package needed)
6
 
7
  # Dashboard
8
  streamlit>=1.35.0