EurekaPotato committed on
Commit
6d92900
·
verified ·
1 Parent(s): cd2f19e

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. text_features.py +35 -17
text_features.py CHANGED
@@ -11,14 +11,15 @@ KEY IMPROVEMENTS:
11
  import numpy as np
12
  from typing import List, Dict, Tuple
13
  from transformers import pipeline
14
- from sentence_transformers import SentenceTransformer
15
  import re
16
 
17
 
18
  class TextFeatureExtractor:
19
  """Extract 9 text features for busy detection"""
20
 
21
- def __init__(self, use_intent_model: bool = True):
 
22
  """
23
  Initialize NLP models
24
 
@@ -39,9 +40,22 @@ class TextFeatureExtractor:
39
  )
40
  print("[OK] Sentiment model loaded")
41
 
42
- # Coherence model
43
- self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
44
- print("[OK] Coherence model loaded")
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  # Always setup patterns — busy_keywords is needed by extract_marker_counts()
47
  self._setup_patterns()
@@ -278,8 +292,8 @@ class TextFeatureExtractor:
278
  if keyword in transcript_lower
279
  )
280
 
281
- # Normalize by total words
282
- cognitive_load = cognitive_load_count / total_words
283
  time_pressure = time_pressure_count / total_words
284
  deflection = deflection_count / total_words
285
 
@@ -319,16 +333,20 @@ class TextFeatureExtractor:
319
  return 0.5 # Neutral if no data (changed from 1.0 to be more conservative)
320
 
321
  try:
322
- # Encode question and responses
323
- question_embedding = self.coherence_model.encode(question, convert_to_tensor=True)
324
- response_embeddings = self.coherence_model.encode(responses, convert_to_tensor=True)
325
-
326
- # Calculate cosine similarity
327
- from sentence_transformers import util
328
- similarities = util.cos_sim(question_embedding, response_embeddings)[0]
329
-
330
- # Average similarity as coherence score
331
- coherence = float(np.mean(similarities.cpu().numpy()))
 
 
 
 
332
 
333
  return max(0.0, min(1.0, coherence)) # Clamp to [0, 1]
334
  except Exception as e:
 
11
  import numpy as np
12
  from typing import List, Dict, Tuple
13
  from transformers import pipeline
14
+ from sentence_transformers import SentenceTransformer, CrossEncoder
15
  import re
16
 
17
 
18
  class TextFeatureExtractor:
19
  """Extract 9 text features for busy detection"""
20
 
21
+ def __init__(self, use_intent_model: bool = True, marker_alpha: float = 1.0, marker_beta: float = 1.0,
22
+ coherence_model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
23
  """
24
  Initialize NLP models
25
 
 
40
  )
41
  print("[OK] Sentiment model loaded")
42
 
43
+ # Coherence model (cross-encoder for next-utterance plausibility)
44
+ self.coherence_model = None
45
+ self.coherence_model_is_cross = True
46
+ self.coherence_model_name = coherence_model_name
47
+ try:
48
+ self.coherence_model = CrossEncoder(self.coherence_model_name, device="cpu")
49
+ print(f"[OK] Coherence model loaded (CrossEncoder: {self.coherence_model_name})")
50
+ except Exception as e:
51
+ print(f"[WARN] CrossEncoder load failed: {e}")
52
+ self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
53
+ self.coherence_model_is_cross = False
54
+ print("[OK] Coherence model loaded (SentenceTransformer fallback)")
55
+
56
+ # Marker smoothing
57
+ self.marker_alpha = float(marker_alpha)
58
+ self.marker_beta = float(marker_beta)
59
 
60
  # Always setup patterns — busy_keywords is needed by extract_marker_counts()
61
  self._setup_patterns()
 
292
  if keyword in transcript_lower
293
  )
294
 
295
+ # Normalize by total words with smoothing (cognitive load only)
296
+ cognitive_load = (cognitive_load_count + self.marker_alpha) / (total_words + self.marker_beta)
297
  time_pressure = time_pressure_count / total_words
298
  deflection = deflection_count / total_words
299
 
 
333
  return 0.5 # Neutral if no data (changed from 1.0 to be more conservative)
334
 
335
  try:
336
+ if self.coherence_model_is_cross:
337
+ pairs = [(question, response) for response in responses]
338
+ scores = self.coherence_model.predict(pairs)
339
+ scores = np.array(scores, dtype=np.float32)
340
+ if np.any(scores < 0.0) or np.any(scores > 1.0):
341
+ scores = 1.0 / (1.0 + np.exp(-scores))
342
+ coherence = float(np.mean(scores))
343
+ else:
344
+ # Fallback: cosine similarity
345
+ question_embedding = self.coherence_model.encode(question, convert_to_tensor=True)
346
+ response_embeddings = self.coherence_model.encode(responses, convert_to_tensor=True)
347
+ from sentence_transformers import util
348
+ similarities = util.cos_sim(question_embedding, response_embeddings)[0]
349
+ coherence = float(np.mean(similarities.cpu().numpy()))
350
 
351
  return max(0.0, min(1.0, coherence)) # Clamp to [0, 1]
352
  except Exception as e: