Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- text_features.py +35 -17
text_features.py
CHANGED
|
@@ -11,14 +11,15 @@ KEY IMPROVEMENTS:
|
|
| 11 |
import numpy as np
|
| 12 |
from typing import List, Dict, Tuple
|
| 13 |
from transformers import pipeline
|
| 14 |
-
from sentence_transformers import SentenceTransformer
|
| 15 |
import re
|
| 16 |
|
| 17 |
|
| 18 |
class TextFeatureExtractor:
|
| 19 |
"""Extract 9 text features for busy detection"""
|
| 20 |
|
| 21 |
-
def __init__(self, use_intent_model: bool = True
|
|
|
|
| 22 |
"""
|
| 23 |
Initialize NLP models
|
| 24 |
|
|
@@ -39,9 +40,22 @@ class TextFeatureExtractor:
|
|
| 39 |
)
|
| 40 |
print("[OK] Sentiment model loaded")
|
| 41 |
|
| 42 |
-
# Coherence model
|
| 43 |
-
self.coherence_model =
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# Always setup patterns — busy_keywords is needed by extract_marker_counts()
|
| 47 |
self._setup_patterns()
|
|
@@ -278,8 +292,8 @@ class TextFeatureExtractor:
|
|
| 278 |
if keyword in transcript_lower
|
| 279 |
)
|
| 280 |
|
| 281 |
-
# Normalize by total words
|
| 282 |
-
cognitive_load = cognitive_load_count / total_words
|
| 283 |
time_pressure = time_pressure_count / total_words
|
| 284 |
deflection = deflection_count / total_words
|
| 285 |
|
|
@@ -319,16 +333,20 @@ class TextFeatureExtractor:
|
|
| 319 |
return 0.5 # Neutral if no data (changed from 1.0 to be more conservative)
|
| 320 |
|
| 321 |
try:
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
return max(0.0, min(1.0, coherence)) # Clamp to [0, 1]
|
| 334 |
except Exception as e:
|
|
|
|
| 11 |
import numpy as np
|
| 12 |
from typing import List, Dict, Tuple
|
| 13 |
from transformers import pipeline
|
| 14 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder
|
| 15 |
import re
|
| 16 |
|
| 17 |
|
| 18 |
class TextFeatureExtractor:
|
| 19 |
"""Extract 9 text features for busy detection"""
|
| 20 |
|
| 21 |
+
def __init__(self, use_intent_model: bool = True, marker_alpha: float = 1.0, marker_beta: float = 1.0,
|
| 22 |
+
coherence_model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
|
| 23 |
"""
|
| 24 |
Initialize NLP models
|
| 25 |
|
|
|
|
| 40 |
)
|
| 41 |
print("[OK] Sentiment model loaded")
|
| 42 |
|
| 43 |
+
# Coherence model (cross-encoder for next-utterance plausibility)
|
| 44 |
+
self.coherence_model = None
|
| 45 |
+
self.coherence_model_is_cross = True
|
| 46 |
+
self.coherence_model_name = coherence_model_name
|
| 47 |
+
try:
|
| 48 |
+
self.coherence_model = CrossEncoder(self.coherence_model_name, device="cpu")
|
| 49 |
+
print(f"[OK] Coherence model loaded (CrossEncoder: {self.coherence_model_name})")
|
| 50 |
+
except Exception as e:
|
| 51 |
+
print(f"[WARN] CrossEncoder load failed: {e}")
|
| 52 |
+
self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 53 |
+
self.coherence_model_is_cross = False
|
| 54 |
+
print("[OK] Coherence model loaded (SentenceTransformer fallback)")
|
| 55 |
+
|
| 56 |
+
# Marker smoothing
|
| 57 |
+
self.marker_alpha = float(marker_alpha)
|
| 58 |
+
self.marker_beta = float(marker_beta)
|
| 59 |
|
| 60 |
# Always setup patterns — busy_keywords is needed by extract_marker_counts()
|
| 61 |
self._setup_patterns()
|
|
|
|
| 292 |
if keyword in transcript_lower
|
| 293 |
)
|
| 294 |
|
| 295 |
+
# Normalize by total words with smoothing (cognitive load only)
|
| 296 |
+
cognitive_load = (cognitive_load_count + self.marker_alpha) / (total_words + self.marker_beta)
|
| 297 |
time_pressure = time_pressure_count / total_words
|
| 298 |
deflection = deflection_count / total_words
|
| 299 |
|
|
|
|
| 333 |
return 0.5 # Neutral if no data (changed from 1.0 to be more conservative)
|
| 334 |
|
| 335 |
try:
|
| 336 |
+
if self.coherence_model_is_cross:
|
| 337 |
+
pairs = [(question, response) for response in responses]
|
| 338 |
+
scores = self.coherence_model.predict(pairs)
|
| 339 |
+
scores = np.array(scores, dtype=np.float32)
|
| 340 |
+
if np.any(scores < 0.0) or np.any(scores > 1.0):
|
| 341 |
+
scores = 1.0 / (1.0 + np.exp(-scores))
|
| 342 |
+
coherence = float(np.mean(scores))
|
| 343 |
+
else:
|
| 344 |
+
# Fallback: cosine similarity
|
| 345 |
+
question_embedding = self.coherence_model.encode(question, convert_to_tensor=True)
|
| 346 |
+
response_embeddings = self.coherence_model.encode(responses, convert_to_tensor=True)
|
| 347 |
+
from sentence_transformers import util
|
| 348 |
+
similarities = util.cos_sim(question_embedding, response_embeddings)[0]
|
| 349 |
+
coherence = float(np.mean(similarities.cpu().numpy()))
|
| 350 |
|
| 351 |
return max(0.0, min(1.0, coherence)) # Clamp to [0, 1]
|
| 352 |
except Exception as e:
|