Eklavya73 commited on
Commit
45993cd
·
verified ·
1 Parent(s): d699073

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Data/Domain-A_Dataset_Clean.csv filter=lfs diff=lfs merge=lfs -text
Data/Domain-A_Dataset_Clean.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a75145c90f0d6dad33433e7fa996dae9941da0cda2065b2015b327e368a19c91
3
+ size 20014738
utils/calibration_utils.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import joblib
6
+ import numpy as np
7
+
8
+ try:
9
+ from scipy.optimize import minimize_scalar
10
+ except Exception: # pragma: no cover - scipy may be unavailable in some runtimes
11
+ minimize_scalar = None
12
+
13
+
14
+ DEFAULT_EPS = 1e-6
15
+
16
+
17
def identity_temperature_scaler(eps: float = DEFAULT_EPS) -> dict:
    """Return a no-op temperature-scaler config (temperature == 1.0)."""
    return dict(
        temperature=1.0,
        eps=float(eps),
        method="logit_temperature_scaling",
        fit_objective="binary_nll",
        fit_split="validation",
        base_calibration="identity",
    )
26
+
27
+
28
+ def _ensure_2d(array_like) -> tuple[np.ndarray, bool]:
29
+ arr = np.asarray(array_like, dtype=float)
30
+ was_1d = arr.ndim == 1
31
+ if was_1d:
32
+ arr = arr.reshape(1, -1)
33
+ return arr, was_1d
34
+
35
+
36
+ def _restore_shape(arr: np.ndarray, was_1d: bool) -> np.ndarray:
37
+ if was_1d:
38
+ return arr.reshape(-1)
39
+ return arr
40
+
41
+
42
def _binary_nll(y_true, probs, eps: float = DEFAULT_EPS) -> float:
    """Mean binary negative log-likelihood with probabilities clipped to (eps, 1-eps)."""
    labels = np.asarray(y_true, dtype=float)
    clipped = np.clip(np.asarray(probs, dtype=float), eps, 1.0 - eps)
    log_likelihood = labels * np.log(clipped) + (1.0 - labels) * np.log(1.0 - clipped)
    return float(-log_likelihood.mean())
46
+
47
+
48
def apply_per_class_calibration(raw_probs, calibrators=None):
    """Apply one optional calibrator per class column.

    Each calibrator is invoked via predict() when available, otherwise
    transform(). A None entry (or no calibrators at all) leaves the
    corresponding column(s) unchanged. 1-D input comes back 1-D.
    """
    probs, was_1d = _ensure_2d(raw_probs)
    adjusted = probs.copy()

    if calibrators:
        for col, calibrator in enumerate(calibrators):
            if calibrator is None:
                continue
            column = adjusted[:, col]
            # Some calibrator objects expose predict(), others transform().
            apply = calibrator.predict if hasattr(calibrator, "predict") else calibrator.transform
            adjusted[:, col] = apply(column)

    return _restore_shape(adjusted, was_1d)
66
+
67
+
68
def apply_temperature_scaling(probabilities, temperature_scaler=None):
    """Temperature-scale probabilities in logit space.

    Probabilities are clipped into (eps, 1-eps), converted to logits,
    divided by the eps-floored temperature, clamped to +/-50 for numeric
    safety, and mapped back through the sigmoid. A falsy scaler falls back
    to the identity configuration.
    """
    probs, was_1d = _ensure_2d(probabilities)
    config = temperature_scaler if temperature_scaler else identity_temperature_scaler()

    eps = float(config.get("eps", DEFAULT_EPS))
    temperature = max(float(config.get("temperature", 1.0)), eps)

    safe = np.clip(probs, eps, 1.0 - eps)
    logits = np.log(safe / (1.0 - safe))
    tempered = np.clip(logits / temperature, -50.0, 50.0)
    rescaled = 1.0 / (1.0 + np.exp(-tempered))

    return _restore_shape(rescaled, was_1d)
82
+
83
+
84
def calibrate_probabilities(raw_probs, tag_calibrators=None, temperature_scaler=None):
    """Run per-class calibration first, then temperature scaling."""
    stage_one = apply_per_class_calibration(raw_probs, tag_calibrators)
    return apply_temperature_scaling(stage_one, temperature_scaler)
87
+
88
+
89
def max_confidence(probabilities) -> float:
    """Largest probability in the input; 0.0 for an empty array."""
    arr = np.asarray(probabilities, dtype=float)
    return float(arr.max()) if arr.size else 0.0
94
+
95
+
96
def fit_temperature_scaler(validation_probs, y_true, bounds=(0.5, 5.0), eps: float = DEFAULT_EPS):
    """Fit a single logit-space temperature on validation data.

    Minimizes binary NLL via scipy's bounded minimize_scalar when scipy is
    available, otherwise a 256-point log-spaced grid search over `bounds`.

    Args:
        validation_probs: Probabilities, 1-D or 2-D (samples x classes).
        y_true: Binary labels of the same logical shape as validation_probs.
        bounds: (low, high) search interval for the temperature.
        eps: Clipping epsilon and temperature floor.

    Returns:
        A scaler dict with the fitted temperature and before/after metrics.

    Raises:
        ValueError: If probs and labels have mismatched shapes.
    """
    probs, _ = _ensure_2d(validation_probs)
    # Bug fix: reshape y_true the same way as the probabilities. Previously a
    # valid 1-D (probs, labels) pair always failed the shape check, because
    # probs became (1, n) while y_true stayed (n,).
    y, _ = _ensure_2d(y_true)
    if probs.shape != y.shape:
        raise ValueError(
            f"Shape mismatch for temperature scaling: probs={probs.shape}, y_true={y.shape}"
        )

    def objective(temp: float) -> float:
        # NLL of the validation labels under temperature `temp`.
        scaled = apply_temperature_scaling(
            probs,
            {"temperature": temp, "eps": eps},
        )
        return _binary_nll(y, scaled, eps=eps)

    if minimize_scalar is not None:
        result = minimize_scalar(objective, bounds=bounds, method="bounded")
        best_temperature = float(result.x) if result.success else 1.0
    else:
        # scipy unavailable: log-spaced grid search over the same bounds.
        grid = np.exp(np.linspace(np.log(bounds[0]), np.log(bounds[1]), 256))
        losses = np.array([objective(float(temp)) for temp in grid], dtype=float)
        best_temperature = float(grid[int(losses.argmin())])

    best_temperature = max(best_temperature, eps)
    scaled = apply_temperature_scaling(
        probs,
        {"temperature": best_temperature, "eps": eps},
    )

    return {
        "temperature": round(best_temperature, 6),
        "eps": float(eps),
        "method": "logit_temperature_scaling",
        "fit_objective": "binary_nll",
        "fit_split": "validation",
        "base_calibration": "per_class_calibrator_then_temperature",
        "nll_before": _binary_nll(y, probs, eps=eps),
        "nll_after": _binary_nll(y, scaled, eps=eps),
        "mean_conf_before": float(np.mean(np.max(probs, axis=1))),
        "mean_conf_after": float(np.mean(np.max(scaled, axis=1))),
    }
137
+
138
+
139
def load_temperature_scaler(path, default=None):
    """Load a temperature-scaler dict from disk.

    A bare number stored on disk is wrapped into a scaler dict. When the
    file does not exist, return `default` (or the identity scaler when
    default is falsy).
    """
    scaler_file = Path(path)
    if not scaler_file.exists():
        return default or identity_temperature_scaler()

    payload = joblib.load(scaler_file)
    if isinstance(payload, dict):
        return payload
    return {"temperature": float(payload), "eps": DEFAULT_EPS}
utils/duplicate_detection_utils.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from pathlib import Path
5
+
6
+ import faiss
7
+ from huggingface_hub import hf_hub_download
8
+ import joblib
9
+ import numpy as np
10
+ import pandas as pd
11
+ from sentence_transformers import SentenceTransformer
12
+
13
+ try:
14
+ from .runtime_utils import (
15
+ load_duplicate_threshold,
16
+ load_model_config,
17
+ resolve_dataset_file,
18
+ resolve_model_dir,
19
+ resolve_model_reference,
20
+ )
21
+ except ImportError: # pragma: no cover
22
+ from runtime_utils import (
23
+ load_duplicate_threshold,
24
+ load_model_config,
25
+ resolve_dataset_file,
26
+ resolve_model_dir,
27
+ resolve_model_reference,
28
+ )
29
+
30
+
31
class CachedDuplicateDetectionEngine:
    """In-memory duplicate-ticket detector backed by a FAISS inner-product
    index over precomputed SBERT embeddings fetched from the HF Hub."""

    def __init__(self, base_dir: str | Path | None = None):
        """Download dataset, ids, and embeddings, then build the FAISS index.

        Args:
            base_dir: Project root used to resolve model assets; defaults to
                the directory containing this file.
        """
        self.base_dir = Path(base_dir).resolve() if base_dir is not None else Path(__file__).resolve().parent
        self.model_dir = resolve_model_dir(self.base_dir)
        self.model_config = load_model_config(self.base_dir)
        self.duplicate_threshold = load_duplicate_threshold(self.base_dir)

        # Download and parse the cleaned dataset exactly once (the original
        # code downloaded and read it twice, doubling startup cost). The
        # top-of-file hf_hub_download import is used; no in-function re-import.
        dataset_path = hf_hub_download(
            repo_id="Eklavya73/ticket-duplicate-assets",
            filename="Domain-A_Dataset_Clean.csv",
        )
        self.dataset = pd.read_csv(dataset_path)
        self.db_texts = self.dataset["text"].astype(str).tolist()

        # Prefer the persisted ticket-id list; fall back to the dataset
        # column. hf_hub_download returns a str path, so it must be wrapped
        # in Path before exists() — the original called .exists() on the
        # str, which raises AttributeError at startup.
        try:
            ticket_id_path = hf_hub_download(
                repo_id="Eklavya73/ticket-duplicate-assets",
                filename="ticket_ids.pkl",
            )
        except Exception:  # best-effort: missing asset falls back below
            ticket_id_path = None
        if ticket_id_path is not None and Path(ticket_id_path).exists():
            self.db_ids = [str(ticket_id) for ticket_id in joblib.load(ticket_id_path)]
        else:
            self.db_ids = self.dataset["ticket_id"].astype(str).tolist()

        embeddings_path = hf_hub_download(
            repo_id="Eklavya73/ticket-duplicate-assets",
            filename="db_embeddings.npy",
        )
        # Single float32 cast (the original chained .astype("float32") twice).
        self.db_embeddings = np.load(embeddings_path).astype("float32")

        if self.db_embeddings.ndim != 2:
            raise ValueError(
                f"Expected 2D duplicate embedding matrix, got shape={self.db_embeddings.shape}"
            )
        # Normalize so inner product == cosine similarity.
        faiss.normalize_L2(self.db_embeddings)

        self.embedding_dim = int(self.db_embeddings.shape[1])
        self.faiss_meta = self._load_faiss_meta()
        self.index = self._build_index(self.db_embeddings)
        self.index.add(self.db_embeddings)
        # Encoder is loaded lazily on first _get_encoder() call.
        self._encoder: SentenceTransformer | None = None
82
+
83
+ def _load_faiss_meta(self) -> dict:
84
+ meta_path = self.model_dir / "faiss_index_meta.pkl"
85
+ if meta_path.exists():
86
+ loaded = joblib.load(meta_path)
87
+ if isinstance(loaded, dict):
88
+ return loaded
89
+ return {
90
+ "dimension": self.embedding_dim,
91
+ "index_type": "flat",
92
+ "size": len(self.db_texts),
93
+ }
94
+
95
+ def _build_index(self, embeddings: np.ndarray):
96
+ index_type = str(self.faiss_meta.get("index_type", "flat")).lower()
97
+ nlist = max(1, int(self.faiss_meta.get("nlist", 256)))
98
+ nprobe = max(1, int(self.faiss_meta.get("nprobe", 48)))
99
+
100
+ if index_type == "ivf" and len(embeddings) >= max(64, nlist):
101
+ quantizer = faiss.IndexFlatIP(self.embedding_dim)
102
+ index = faiss.IndexIVFFlat(
103
+ quantizer,
104
+ self.embedding_dim,
105
+ nlist,
106
+ faiss.METRIC_INNER_PRODUCT,
107
+ )
108
+ index.train(embeddings)
109
+ index.nprobe = min(nprobe, nlist)
110
+ return index
111
+
112
+ return faiss.IndexFlatIP(self.embedding_dim)
113
+
114
+ @property
115
+ def index_size(self) -> int:
116
+ return int(self.index.ntotal)
117
+
118
+ def _get_encoder(self) -> SentenceTransformer:
119
+ if self._encoder is None:
120
+ model_ref = resolve_model_reference(
121
+ self.model_config.get("duplicate_sbert_model", "Eklavya73/duplicate_sbert"),
122
+ base_dir=self.base_dir,
123
+ model_dir=self.model_dir,
124
+ default="all-mpnet-base-v2",
125
+ )
126
+ self._encoder = SentenceTransformer(model_ref)
127
+ return self._encoder
128
+
129
+ def _encode(
130
+ self,
131
+ texts,
132
+ *,
133
+ batch_size: int = 64,
134
+ show_progress_bar: bool = False,
135
+ ) -> np.ndarray:
136
+ encoder = self._get_encoder()
137
+ embeddings = encoder.encode(
138
+ list(texts),
139
+ batch_size=batch_size,
140
+ show_progress_bar=show_progress_bar,
141
+ normalize_embeddings=True,
142
+ )
143
+ return np.asarray(embeddings, dtype="float32")
144
+
145
+ def _normalize_query(self, embedding) -> np.ndarray:
146
+ query = np.asarray(embedding, dtype="float32").reshape(1, -1).copy()
147
+ faiss.normalize_L2(query)
148
+ return query
149
+
150
+ def _search(self, embedding, *, k: int = 20):
151
+ if self.index_size == 0:
152
+ return np.empty((1, 0), dtype="float32"), np.empty((1, 0), dtype=int)
153
+ query = self._normalize_query(embedding)
154
+ return self.index.search(query, min(max(1, int(k)), self.index_size))
155
+
156
+ def find_best_match(
157
+ self,
158
+ embedding,
159
+ *,
160
+ k: int = 20,
161
+ exclude_indices=None,
162
+ ) -> dict | None:
163
+ scores, indices = self._search(embedding, k=k)
164
+ excluded = set(int(idx) for idx in (exclude_indices or []))
165
+
166
+ for score, idx in zip(scores[0], indices[0]):
167
+ idx = int(idx)
168
+ if idx < 0 or idx in excluded:
169
+ continue
170
+ return {
171
+ "index": idx,
172
+ "ticket_id": self.db_ids[idx] if idx < len(self.db_ids) else None,
173
+ "duplicate_of": self.db_ids[idx] if idx < len(self.db_ids) else None,
174
+ "matched_text": self.db_texts[idx] if idx < len(self.db_texts) else None,
175
+ "similarity": float(score),
176
+ }
177
+ return None
178
+
179
+ def detect_duplicate(
180
+ self,
181
+ text: str | None = None,
182
+ *,
183
+ embedding=None,
184
+ k: int = 20,
185
+ exclude_indices=None,
186
+ ) -> dict | None:
187
+ if embedding is None:
188
+ if text is None:
189
+ raise ValueError("Either text or embedding must be provided.")
190
+ embedding = self._encode([str(text)])[0]
191
+
192
+ match = self.find_best_match(
193
+ embedding,
194
+ k=k,
195
+ exclude_indices=exclude_indices,
196
+ )
197
+ if match is None:
198
+ return None
199
+ if float(match["similarity"]) < float(self.duplicate_threshold):
200
+ return None
201
+ return match
202
+
203
+ def add_ticket(
204
+ self,
205
+ ticket_id: str,
206
+ text: str,
207
+ *,
208
+ embedding=None,
209
+ ) -> None:
210
+ if embedding is None:
211
+ embedding = self._encode([str(text)])[0]
212
+
213
+ query = self._normalize_query(embedding)
214
+ self.index.add(query)
215
+ self.db_ids.append(str(ticket_id))
216
+ self.db_texts.append(str(text))
217
+ self.db_embeddings = np.vstack([self.db_embeddings, query]).astype("float32")
218
+ self.faiss_meta["size"] = int(self.index.ntotal)
219
+
220
+ def benchmark_duplicate_detection(self, *, num_queries: int = 200, k: int = 5) -> dict:
221
+ if self.index_size <= 1:
222
+ return {
223
+ "exact_latency_ms": 0.0,
224
+ "faiss_latency_ms": 0.0,
225
+ "speedup_vs_exact": 0.0,
226
+ "recall_at_k": 0.0,
227
+ "duplicate_precision": 0.0,
228
+ "duplicate_recall": 0.0,
229
+ "duplicate_f1": 0.0,
230
+ "duplicate_eval_pairs": 0,
231
+ }
232
+
233
+ k = max(1, int(k))
234
+ rng = np.random.default_rng(42)
235
+ query_count = min(int(num_queries), self.index_size)
236
+ sampled_indices = rng.choice(self.index_size, size=query_count, replace=False)
237
+
238
+ exact_hits = 0
239
+ tp = 0
240
+ fp = 0
241
+ fn = 0
242
+ exact_latencies = []
243
+ faiss_latencies = []
244
+
245
+ for query_idx in sampled_indices:
246
+ query_embedding = self.db_embeddings[query_idx]
247
+
248
+ exact_start = time.perf_counter()
249
+ similarities = self.db_embeddings @ query_embedding
250
+ similarities[int(query_idx)] = -np.inf
251
+ exact_top = np.argsort(-similarities)[:k]
252
+ exact_score = float(similarities[int(exact_top[0])]) if exact_top.size else 0.0
253
+ exact_is_duplicate = exact_score >= float(self.duplicate_threshold)
254
+ exact_latencies.append((time.perf_counter() - exact_start) * 1000.0)
255
+
256
+ faiss_start = time.perf_counter()
257
+ distances, neighbors = self._search(query_embedding, k=k + 1)
258
+ faiss_latencies.append((time.perf_counter() - faiss_start) * 1000.0)
259
+
260
+ faiss_candidates = []
261
+ faiss_best_score = 0.0
262
+ for score, neighbor_idx in zip(distances[0], neighbors[0]):
263
+ neighbor_idx = int(neighbor_idx)
264
+ if neighbor_idx < 0 or neighbor_idx == int(query_idx):
265
+ continue
266
+ faiss_candidates.append(neighbor_idx)
267
+ if len(faiss_candidates) == 1:
268
+ faiss_best_score = float(score)
269
+ if len(faiss_candidates) >= k:
270
+ break
271
+
272
+ if exact_top.size and int(exact_top[0]) in set(faiss_candidates):
273
+ exact_hits += 1
274
+
275
+ pred_is_duplicate = bool(faiss_candidates) and faiss_best_score >= float(self.duplicate_threshold)
276
+ if pred_is_duplicate and exact_is_duplicate:
277
+ tp += 1
278
+ elif pred_is_duplicate and not exact_is_duplicate:
279
+ fp += 1
280
+ elif exact_is_duplicate and not pred_is_duplicate:
281
+ fn += 1
282
+
283
+ precision = tp / max(tp + fp, 1)
284
+ recall = tp / max(tp + fn, 1)
285
+ f1 = 0.0 if (precision + recall) == 0.0 else (2.0 * precision * recall) / (precision + recall)
286
+
287
+ exact_latency_ms = float(np.mean(exact_latencies)) if exact_latencies else 0.0
288
+ faiss_latency_ms = float(np.mean(faiss_latencies)) if faiss_latencies else 0.0
289
+
290
+ return {
291
+ "exact_latency_ms": exact_latency_ms,
292
+ "faiss_latency_ms": faiss_latency_ms,
293
+ "speedup_vs_exact": (
294
+ float(exact_latency_ms / faiss_latency_ms)
295
+ if faiss_latency_ms > 0.0
296
+ else 0.0
297
+ ),
298
+ "recall_at_k": float(exact_hits / max(query_count, 1)),
299
+ "duplicate_precision": float(precision),
300
+ "duplicate_recall": float(recall),
301
+ "duplicate_f1": float(f1),
302
+ "duplicate_eval_pairs": int(query_count),
303
+ }
304
+
305
+ def get_duplicate_metrics(self) -> dict:
306
+ return {
307
+ "duplicate_threshold": float(self.duplicate_threshold),
308
+ "faiss_meta": {
309
+ "dimension": int(self.embedding_dim),
310
+ "index_type": str(self.faiss_meta.get("index_type", "flat")),
311
+ "nlist": int(self.faiss_meta.get("nlist", 0)),
312
+ "nprobe": int(self.faiss_meta.get("nprobe", 0)),
313
+ "size": int(self.index_size),
314
+ },
315
+ }
utils/hybrid_routing_utils.py ADDED
@@ -0,0 +1,769 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ from collections import Counter, defaultdict
5
+ from pathlib import Path
6
+ from typing import Mapping
7
+
8
+ import joblib
9
+ import numpy as np
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+
12
+
13
+ DEFAULT_TAG_TO_DEPARTMENT = {
14
+ "technical_issue": "Technical_Support",
15
+ "hardware_issue": "Technical_Support",
16
+ "software_issue": "Technical_Support",
17
+ "network_issue": "Technical_Support",
18
+ "performance_issue": "Technical_Support",
19
+ "system_issue": "Technical_Support",
20
+ "configuration_issue": "Technical_Support",
21
+ "compatibility_issue": "Technical_Support",
22
+ "maintenance_issue": "Technical_Support",
23
+ "update_request": "Technical_Support",
24
+ "scalability_issue": "Technical_Support",
25
+ "synchronization_issue": "Technical_Support",
26
+ "access_issue": "IT_Support",
27
+ "authentication_issue": "IT_Support",
28
+ "security_issue": "IT_Support",
29
+ "api_issue": "IT_Support",
30
+ "data_issue": "IT_Support",
31
+ "integration_issue": "IT_Support",
32
+ "incident": "Service_Outages_And_Maintenance",
33
+ "service_request": "Customer_Service",
34
+ "general_inquiry": "Customer_Service",
35
+ "billing_issue": "Billing_And_Payments",
36
+ "refund_request": "Billing_And_Payments",
37
+ "order_issue": "Returns_And_Exchanges",
38
+ "feature_request": "Product_Support",
39
+ "training_request": "Human_Resources",
40
+ "sales_inquiry": "Sales_And_Presales",
41
+ "digital_marketing": "Marketing",
42
+ "digital_strategy": "Marketing",
43
+ }
44
+
45
+ KNOWN_DEPARTMENT_ALIASES = {
46
+ "technical_support": "Technical_Support",
47
+ "it_support": "IT_Support",
48
+ "service_outages_and_maintenance": "Service_Outages_And_Maintenance",
49
+ "customer_service": "Customer_Service",
50
+ "billing_and_payments": "Billing_And_Payments",
51
+ "returns_and_exchanges": "Returns_And_Exchanges",
52
+ "product_support": "Product_Support",
53
+ "human_resources": "Human_Resources",
54
+ "sales_and_presales": "Sales_And_Presales",
55
+ "marketing": "Marketing",
56
+ "general_inquiry": "Customer_Service",
57
+ }
58
+
59
+
60
+ def _normalize_mapping(mapping) -> dict[str, str]:
61
+ if not isinstance(mapping, Mapping):
62
+ return {}
63
+ return {
64
+ str(tag): str(department)
65
+ for tag, department in mapping.items()
66
+ }
67
+
68
+
69
def _extract_tag_mapping(policy_or_mapping) -> dict[str, str]:
    """Accept either a full policy dict or a bare tag->department mapping."""
    is_policy = (
        isinstance(policy_or_mapping, Mapping)
        and "tag_to_department" in policy_or_mapping
    )
    source = policy_or_mapping.get("tag_to_department", {}) if is_policy else policy_or_mapping
    return _normalize_mapping(source)
73
+
74
+
75
+ def _normalize_department_key(value: str) -> str:
76
+ return (
77
+ str(value)
78
+ .strip()
79
+ .lower()
80
+ .replace("&", "and")
81
+ .replace("-", "_")
82
+ .replace(" ", "_")
83
+ )
84
+
85
+
86
def canonicalize_department_name(name, *, valid_departments=None):
    """Map a free-form department name onto its canonical form.

    Resolution order: exact normalized match against valid_departments, then
    the known-alias table. With a non-empty valid_departments list an
    unresolvable name yields None; without one the stripped input is
    returned unchanged. None / empty / 'nan' inputs yield None.
    """
    if name is None:
        return None

    raw = str(name).strip()
    if not raw or raw.lower() == "nan":
        return None

    candidates = [str(department) for department in (valid_departments or [])]
    lookup = {
        _normalize_department_key(department): department
        for department in candidates
    }
    key = _normalize_department_key(raw)

    if key in lookup:
        return lookup[key]

    alias = KNOWN_DEPARTMENT_ALIASES.get(key)
    if alias is not None:
        return alias

    return None if candidates else raw
109
+
110
+
111
def parse_tag_list(value) -> list[str]:
    """Parse a tag collection from a list or its string representation.

    Strings are parsed with ast.literal_eval; any input that is not a list
    (or fails to parse into one) yields [].
    """
    if isinstance(value, list):
        return [str(item) for item in value]

    if not isinstance(value, str) or not value.strip():
        return []

    try:
        parsed = ast.literal_eval(value.strip())
    except Exception:
        return []
    return [str(item) for item in parsed] if isinstance(parsed, list) else []
130
+
131
+
132
def validate_routing_label_mapping(
    tag_to_department,
    *,
    valid_tags=None,
    valid_departments=None,
):
    """Validate a tag->department mapping against allowed tags/departments.

    Args:
        tag_to_department: Mapping (or policy dict) of tag -> department.
        valid_tags: Allowed tags; None skips the tag check.
        valid_departments: Allowed departments; None skips that check.

    Returns:
        (cleaned_mapping, missing_tags, stale_tags, invalid_departments):
          cleaned_mapping     — entries whose tag is valid and department allowed
          missing_tags        — valid tags absent from the cleaned mapping
          stale_tags          — mapped tags not in valid_tags
          invalid_departments — {tag: department} pointing outside valid_departments
    """
    normalized = _normalize_mapping(tag_to_department)

    valid_tag_list = [str(tag) for tag in valid_tags] if valid_tags is not None else None
    # Sets give O(1) membership; the list above preserves reporting order.
    # (The original did `tag not in valid_tag_list`, O(tags * mapping).)
    valid_tag_set = set(valid_tag_list) if valid_tag_list is not None else None
    valid_department_set = (
        {str(department) for department in valid_departments}
        if valid_departments is not None
        else None
    )

    stale_tags = []
    if valid_tag_set is not None:
        stale_tags = [tag for tag in normalized if tag not in valid_tag_set]

    invalid_departments = {}
    if valid_department_set is not None:
        invalid_departments = {
            tag: department
            for tag, department in normalized.items()
            if department not in valid_department_set
        }

    stale_set = set(stale_tags)
    cleaned_mapping = {
        tag: department
        for tag, department in normalized.items()
        if tag not in stale_set and tag not in invalid_departments
    }

    missing_tags = []
    if valid_tag_list is not None:
        missing_tags = [tag for tag in valid_tag_list if tag not in cleaned_mapping]

    return cleaned_mapping, missing_tags, stale_tags, invalid_departments
179
+
180
+
181
def build_routing_label_policy(
    *,
    valid_tags,
    valid_departments,
    base_mapping=None,
    fallback_tag_to_department=None,
    default_department="Human_Review",
):
    """Merge fallback + base tag->department mappings into a validated policy.

    base_mapping entries override the fallback mapping; the returned policy
    dict carries the cleaned mapping plus validation diagnostics.
    """
    merged = _normalize_mapping(fallback_tag_to_department)
    merged.update(_normalize_mapping(base_mapping))

    cleaned, missing, stale, invalid = validate_routing_label_mapping(
        merged,
        valid_tags=valid_tags,
        valid_departments=valid_departments,
    )

    policy = {
        "tag_to_department": cleaned,
        "valid_tags": [str(tag) for tag in valid_tags],
        "valid_departments": [str(department) for department in valid_departments],
        "default_department": str(default_department),
        "missing_tags": missing,
        "stale_tags": stale,
        "invalid_departments": invalid,
        "mapping_count": len(cleaned),
    }
    return policy
210
+
211
+
212
def assert_valid_routing_label_policy(
    policy_or_mapping,
    *,
    valid_tags,
    valid_departments,
):
    """Validate a policy/mapping and raise ValueError on any defect.

    Returns the cleaned mapping when everything checks out.
    """
    cleaned, missing, stale, invalid = validate_routing_label_mapping(
        _extract_tag_mapping(policy_or_mapping),
        valid_tags=valid_tags,
        valid_departments=valid_departments,
    )

    if missing or stale or invalid:
        raise ValueError(
            "Routing label policy validation failed: "
            f"missing_tags={missing}, "
            f"stale_tags={stale}, "
            f"invalid_departments={invalid}"
        )

    return cleaned
235
+
236
+
237
def assert_predicted_tags_mapped(
    predicted_tags,
    policy_or_mapping,
    *,
    valid_departments,
):
    """Raise ValueError unless every predicted tag maps to a valid department."""
    mapping = _extract_tag_mapping(policy_or_mapping)
    allowed = {str(department) for department in valid_departments}

    missing_tags = [
        str(tag) for tag in predicted_tags
        if str(tag) not in mapping
    ]
    invalid_departments = {
        str(tag): mapping.get(str(tag))
        for tag in predicted_tags
        if mapping.get(str(tag)) not in allowed
    }

    if missing_tags or invalid_departments:
        raise ValueError(
            "Predicted tags are not fully covered by the routing label policy: "
            f"missing_tags={missing_tags}, "
            f"invalid_departments={invalid_departments}"
        )
262
+
263
+
264
+ def _normalize_vector(vector):
265
+ arr = np.asarray(vector, dtype=float).reshape(-1)
266
+ if arr.size == 0:
267
+ return arr
268
+ norm = np.linalg.norm(arr)
269
+ if norm == 0.0:
270
+ return arr
271
+ return arr / norm
272
+
273
+
274
def _build_department_semantic_vectors(
    valid_departments,
    *,
    dept_prototypes=None,
    embed_text_fn=None,
):
    """Blend prototype (weight 0.25) and label-text (0.75) embeddings per department.

    Departments with neither signal are omitted. A single-source vector is
    used as-is; a blended vector is re-normalized after the weighted sum.
    """
    vectors = {}

    for department in valid_departments:
        parts = []

        if dept_prototypes is not None and department in dept_prototypes:
            prototype = _normalize_vector(dept_prototypes[department])
            if prototype.size:
                parts.append((0.25, prototype))

        if embed_text_fn is not None:
            label_text = str(department).replace("_", " ")
            label_vector = _normalize_vector(embed_text_fn(label_text))
            if label_vector.size:
                parts.append((0.75, label_vector))

        if not parts:
            continue

        if len(parts) == 1:
            vectors[str(department)] = parts[0][1]
        else:
            blended = sum(weight * vector for weight, vector in parts)
            vectors[str(department)] = _normalize_vector(blended)

    return vectors
308
+
309
+
310
def _department_semantic_scores(tag, department_vectors, *, embed_text_fn=None):
    """Similarity of a tag's embedding to each department vector.

    Returns {} when no embedder or no department vectors are available.
    Raw dot products are mapped through normalize_semantic_similarity;
    departments whose vector shape mismatches the tag's are skipped.
    """
    if embed_text_fn is None or not department_vectors:
        return {}

    tag_vector = _normalize_vector(embed_text_fn(str(tag).replace("_", " ")))
    if tag_vector.size == 0:
        return {}

    results = {}
    for department, department_vector in department_vectors.items():
        if department_vector.shape != tag_vector.shape:
            continue
        similarity = float(np.dot(tag_vector, department_vector))
        results[department] = normalize_semantic_similarity(similarity)

    return results
326
+
327
+
328
def build_department_prototypes_from_tag_map(
    texts,
    tag_lists,
    *,
    tag_to_department,
    embed_texts_fn,
    min_examples=5,
):
    """Build one L2-normalized mean-embedding prototype per department.

    Texts are grouped under every department their tags map to; departments
    with fewer than min_examples texts are skipped.

    Raises:
        ValueError: If embed_texts_fn is missing or no prototype can be built.
    """
    if embed_texts_fn is None:
        raise ValueError("embed_texts_fn is required to build department prototypes.")

    mapping = _extract_tag_mapping(tag_to_department)
    grouped: dict[str, list[str]] = defaultdict(list)

    for text, raw_tags in zip(texts, tag_lists):
        departments = {
            mapping[tag]
            for tag in parse_tag_list(raw_tags)
            if tag in mapping
        }
        for department in departments:
            grouped[department].append(str(text))

    prototypes = {}
    threshold = int(min_examples)
    for department, examples in grouped.items():
        if len(examples) < threshold:
            continue

        matrix = np.asarray(embed_texts_fn(examples), dtype=float)
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)

        centroid = _normalize_vector(np.mean(matrix, axis=0))
        if centroid.size == 0:
            continue
        prototypes[department] = centroid

    if not prototypes:
        raise ValueError("No department prototypes could be built from the provided texts and tags.")

    return prototypes
369
+
370
+
371
def analyze_routing_label_policy(
    dataset_paths,
    *,
    tag_classes,
    valid_departments,
    existing_mapping=None,
    queue_column="queue",
    tags_column="tags",
):
    """Audit tag->department evidence across one or more labelled CSV datasets.

    For every tag in ``tag_classes``, counts how often it co-occurs with each
    canonical department (derived from ``queue_column``), then compares the
    observed majority department against ``existing_mapping``.

    Args:
        dataset_paths: Iterable of CSV paths; each file must contain
            ``queue_column`` and ``tags_column``.
        tag_classes: Tag vocabulary to audit.
        valid_departments: Allowed canonical department names.
        existing_mapping: Optional current tag->department mapping (any shape
            accepted by ``_extract_tag_mapping``).
        queue_column: Column holding the routed queue/department label.
        tags_column: Column holding the serialized tag list.

    Returns:
        A dict with per-tag statistics plus summary lists: tags missing a
        mapping, mapped tags outside the vocabulary, mappings pointing at
        invalid departments, tags never observed, and tags whose current
        mapping disagrees with the observed majority department.

    Raises:
        FileNotFoundError: If any dataset path does not exist.
    """
    # Imported lazily so the module stays importable without pandas installed.
    import pandas as pd

    valid_tags = [str(tag) for tag in tag_classes]
    valid_departments = [str(department) for department in valid_departments]
    existing_mapping = _extract_tag_mapping(existing_mapping)

    # tag -> Counter(department -> co-occurrence count)
    tag_department_counts: dict[str, Counter] = defaultdict(Counter)
    observed_departments = set()
    total_rows = 0
    used_rows = 0

    for dataset_path in dataset_paths:
        path = Path(dataset_path)
        if not path.exists():
            raise FileNotFoundError(f"Dataset not found for routing-label analysis: {path}")

        frame = pd.read_csv(path, usecols=[queue_column, tags_column])
        total_rows += len(frame)

        for queue_value, tags_value in zip(frame[queue_column], frame[tags_column]):
            department = canonicalize_department_name(
                queue_value,
                valid_departments=valid_departments,
            )
            tags = parse_tag_list(tags_value)
            # Rows without a recognizable department or without tags carry no signal.
            if department is None or not tags:
                continue

            used_rows += 1
            observed_departments.add(department)

            for tag in tags:
                if tag in valid_tags:
                    tag_department_counts[tag][department] += 1

    tag_statistics = {}
    missing_mappings = []
    unused_tags = []
    majority_mismatch_tags = []

    for tag in valid_tags:
        counts = tag_department_counts.get(tag, Counter())
        ranked = counts.most_common()
        total_examples = int(sum(counts.values()))
        majority_department = ranked[0][0] if ranked else None
        majority_count = int(ranked[0][1]) if ranked else 0
        fallback_department = ranked[1][0] if len(ranked) > 1 else None
        fallback_count = int(ranked[1][1]) if len(ranked) > 1 else 0
        majority_share = float(majority_count / total_examples) if total_examples else 0.0
        # Gap between the top-1 and top-2 departments, as a share of examples.
        majority_margin_share = (
            float((majority_count - fallback_count) / total_examples)
            if total_examples
            else 0.0
        )

        current_department = existing_mapping.get(tag)
        if current_department is None:
            missing_mappings.append(tag)
        if total_examples == 0:
            unused_tags.append(tag)
        if current_department is not None and majority_department is not None and current_department != majority_department:
            majority_mismatch_tags.append(tag)

        tag_statistics[tag] = {
            "current_department": current_department,
            "department_counts": dict(counts),
            "total_examples": total_examples,
            "majority_department": majority_department,
            "majority_share": majority_share,
            "majority_margin_share": majority_margin_share,
            "fallback_department": fallback_department,
            "fallback_share": float(fallback_count / total_examples) if total_examples else 0.0,
            "missing_mapping": current_department is None,
            "unused_tag": total_examples == 0,
        }

    # Mapped tags that are not part of the audited vocabulary.
    redundant_mappings = [
        tag for tag in existing_mapping
        if tag not in valid_tags
    ]
    # Mappings whose target department cannot be canonicalized.
    invalid_departments = {
        tag: department
        for tag, department in existing_mapping.items()
        if canonicalize_department_name(department, valid_departments=valid_departments) is None
    }

    return {
        "dataset_paths": [str(Path(path)) for path in dataset_paths],
        "valid_tags": valid_tags,
        "valid_departments": valid_departments,
        "tag_statistics": tag_statistics,
        "missing_mappings": missing_mappings,
        "redundant_mappings": redundant_mappings,
        "invalid_departments": invalid_departments,
        "unused_tags": unused_tags,
        "majority_mismatch_tags": majority_mismatch_tags,
        "observed_departments": sorted(observed_departments),
        "row_count": int(total_rows),
        "used_row_count": int(used_rows),
    }
480
+
481
+
482
def rebuild_routing_label_policy(
    analysis,
    *,
    existing_mapping=None,
    valid_departments=None,
    dept_prototypes=None,
    embed_text_fn=None,
    rare_tag_threshold=500,
    strong_majority_threshold=0.50,
    distribution_weight=0.60,
    semantic_weight=0.40,
    default_department="Human_Review",
    semantic_override_margin=0.05,
):
    """Rebuild the tag->department routing policy from an analysis report.

    Frequent tags (>= ``rare_tag_threshold`` observed examples) are mapped to
    their observed majority department; rare/unseen tags fall back to semantic
    similarity against department prototype vectors, with a small margin that
    lets an existing mapping survive near-ties.

    Args:
        analysis: Output of ``analyze_routing_label_policy``.
        existing_mapping: Optional current mapping used as tie-breaker/fallback.
        valid_departments: Department whitelist; defaults to the analysis value.
        dept_prototypes / embed_text_fn: Inputs for the semantic scoring helpers.
        rare_tag_threshold: Minimum example count for distribution-based routing.
        strong_majority_threshold: Majority share above which the choice is
            labelled "majority_distribution" rather than "majority_with_fallback".
        distribution_weight: NOTE(review): accepted but never read below —
            combined scores use only ``semantic_weight``; confirm intent.
        semantic_weight: Weight applied to semantic scores for rare tags.
        default_department: Catch-all department for the built policy.
        semantic_override_margin: Slack allowing the existing mapping to win.

    Returns:
        A policy dict (from ``build_routing_label_policy``) extended with
        per-tag metadata and an analysis summary.

    Raises:
        ValueError: If no valid department can be determined for some tag.
    """
    tag_statistics = analysis.get("tag_statistics", {})
    valid_tags = [str(tag) for tag in analysis.get("valid_tags", tag_statistics.keys())]
    valid_departments = [
        str(department)
        for department in (valid_departments or analysis.get("valid_departments", []))
    ]
    existing_mapping = _extract_tag_mapping(existing_mapping)

    department_vectors = _build_department_semantic_vectors(
        valid_departments,
        dept_prototypes=dept_prototypes,
        embed_text_fn=embed_text_fn,
    )

    rebuilt_mapping = {}
    tag_metadata = {}

    for tag in valid_tags:
        stats = tag_statistics.get(tag, {})
        counts = Counter(stats.get("department_counts", {}))
        total_examples = int(stats.get("total_examples", 0))
        majority_department = stats.get("majority_department")
        majority_share = float(stats.get("majority_share", 0.0))
        semantic_scores = _department_semantic_scores(
            tag,
            department_vectors,
            embed_text_fn=embed_text_fn,
        )
        semantic_department = (
            max(semantic_scores, key=semantic_scores.get)
            if semantic_scores
            else None
        )

        if total_examples >= rare_tag_threshold and majority_department is not None:
            # Frequent tag: trust the observed label distribution.
            primary_department = majority_department
            selection_reason = (
                "majority_distribution"
                if majority_share >= strong_majority_threshold
                else "majority_with_fallback"
            )
            ranked_combined = []
        else:
            # Rare or unseen tag: score departments semantically.
            combined_scores = {}
            for department in valid_departments:
                semantic_score = float(semantic_scores.get(department, 0.0))
                combined_scores[department] = float(semantic_weight) * semantic_score

            ranked_combined = sorted(
                combined_scores.items(),
                key=lambda item: item[1],
                reverse=True,
            )
            primary_department = semantic_department or (ranked_combined[0][0] if ranked_combined else None)
            current_department = existing_mapping.get(tag)
            current_semantic_score = float(semantic_scores.get(current_department, 0.0))
            winning_semantic_score = float(semantic_scores.get(primary_department, 0.0))
            # Keep the existing mapping when it is semantically within the margin.
            if (
                current_department in valid_departments
                and current_semantic_score + float(semantic_override_margin) >= winning_semantic_score
            ):
                primary_department = current_department
            if total_examples == 0:
                selection_reason = "semantic_fallback"
            else:
                selection_reason = "rare_tag_semantic"

        if primary_department is None:
            primary_department = existing_mapping.get(tag)
            selection_reason = "existing_mapping_fallback"

        if primary_department is None or primary_department not in valid_departments:
            raise ValueError(f"Unable to determine a valid department for tag '{tag}'.")

        # Secondary choice: observed runner-up first, semantic runner-up otherwise.
        observed_fallback = stats.get("fallback_department")
        combined_fallback = (
            ranked_combined[1][0]
            if len(ranked_combined) > 1
            else None
        )
        fallback_department = observed_fallback or combined_fallback
        if fallback_department == primary_department:
            fallback_department = None

        rebuilt_mapping[tag] = primary_department
        tag_metadata[tag] = {
            "primary_department": primary_department,
            "fallback_department": fallback_department,
            "current_department": existing_mapping.get(tag),
            "majority_department": majority_department,
            "majority_share": majority_share,
            "total_examples": total_examples,
            "selection_reason": selection_reason,
            "department_counts": dict(counts),
            "semantic_department": semantic_department,
        }

    policy = build_routing_label_policy(
        valid_tags=valid_tags,
        valid_departments=valid_departments,
        base_mapping=rebuilt_mapping,
        fallback_tag_to_department=existing_mapping,
        default_department=default_department,
    )
    policy.update(
        {
            "tag_metadata": tag_metadata,
            "analysis_summary": {
                "row_count": int(analysis.get("row_count", 0)),
                "used_row_count": int(analysis.get("used_row_count", 0)),
                "missing_mappings": list(analysis.get("missing_mappings", [])),
                "redundant_mappings": list(analysis.get("redundant_mappings", [])),
                "invalid_departments": dict(analysis.get("invalid_departments", {})),
                "unused_tags": list(analysis.get("unused_tags", [])),
                "majority_mismatch_tags": list(analysis.get("majority_mismatch_tags", [])),
            },
            "policy_name": "routing_label_policy",
        }
    )
    return policy
616
+
617
+
618
def load_routing_label_policy(
    policy_path,
    *,
    fallback_tag_to_department=None,
    valid_tags=None,
    valid_departments=None,
    default_department="Human_Review",
):
    """Load a persisted routing-label policy and rebuild it over defaults.

    The persisted artifact (joblib) supplies the base tag->department mapping;
    any additional keys it carries (except ``tag_to_department``) are copied
    onto the freshly built policy so metadata survives the round trip.

    Args:
        policy_path: Path to the joblib policy artifact; ``None`` or a missing
            file yields a policy built purely from the fallbacks.
        fallback_tag_to_department: Mapping used for tags absent from the file.
        valid_tags: Tag whitelist; defaults to the persisted mapping's keys.
        valid_departments: Department whitelist; defaults to empty.
        default_department: Catch-all department.

    Returns:
        A policy dict from ``build_routing_label_policy``, overlaid with the
        persisted artifact's extra keys.
    """
    path = Path(policy_path) if policy_path is not None else None
    loaded_policy = {}
    if path is not None and path.exists():
        loaded_policy = joblib.load(path)

    base_mapping = _extract_tag_mapping(loaded_policy)

    valid_tags = (
        [str(tag) for tag in valid_tags]
        if valid_tags is not None
        else list(base_mapping)
    )
    valid_departments = (
        [str(department) for department in valid_departments]
        if valid_departments is not None
        else []
    )

    policy = build_routing_label_policy(
        valid_tags=valid_tags,
        valid_departments=valid_departments,
        base_mapping=base_mapping,
        fallback_tag_to_department=fallback_tag_to_department,
        default_department=default_department,
    )
    # Preserve any extra persisted keys; the mapping itself was already merged.
    if isinstance(loaded_policy, Mapping):
        for key, value in loaded_policy.items():
            if key == "tag_to_department":
                continue
            policy[key] = value
    return policy
657
+
658
+
659
def normalize_semantic_similarity(raw_similarity: float) -> float:
    """Rescale a cosine similarity from [-1, 1] onto [0, 1], clipping overflow."""
    rescaled = (float(raw_similarity) + 1.0) / 2.0
    return float(np.clip(rescaled, 0.0, 1.0))
661
+
662
+
663
+ def _to_tag_prob_dict(tag_prob_source, tag_names=None) -> dict[str, float]:
664
+ if isinstance(tag_prob_source, Mapping):
665
+ return {str(tag): float(prob) for tag, prob in tag_prob_source.items()}
666
+
667
+ if tag_names is None:
668
+ raise ValueError("tag_names are required when tag_prob_source is not a mapping.")
669
+
670
+ probs = np.asarray(tag_prob_source, dtype=float).reshape(-1)
671
+ return {
672
+ str(tag_names[idx]): float(probs[idx])
673
+ for idx in range(min(len(tag_names), probs.size))
674
+ }
675
+
676
+
677
+ def _department_classifier_confidence(probabilities) -> float:
678
+ if len(probabilities) == 0:
679
+ return 0.0
680
+ probs = np.clip(np.asarray(probabilities, dtype=float), 0.0, 1.0)
681
+ return float(1.0 - np.prod(1.0 - probs))
682
+
683
+
684
def compute_department_hybrid_scores(
    tag_prob_source,
    embedding,
    dept_prototypes,
    tag_to_department=None,
    *,
    tag_names=None,
    classifier_weight=0.7,
    similarity_weight=0.3,
    top_k=5,
):
    """Blend tag-classifier confidence with prototype similarity per department.

    For each candidate department the score is
    ``classifier_weight * noisy-OR(tag probs) + similarity_weight * sim``,
    where ``sim`` is the cosine similarity between ``embedding`` and that
    department's prototype vector, rescaled to [0, 1].

    Args:
        tag_prob_source: Mapping of tag->probability, or an array-like paired
            with ``tag_names``.
        embedding: Ticket embedding; reshaped to a single row vector.
        dept_prototypes: Mapping of department -> prototype embedding.
        tag_to_department: Tag routing map; defaults to DEFAULT_TAG_TO_DEPARTMENT.
        tag_names: Required when ``tag_prob_source`` is not a mapping.
        classifier_weight / similarity_weight: Blend weights.
        top_k: Only the top-k most probable tags vote; ``None`` keeps all.

    Returns:
        Tuple ``(best_department, best_hybrid_confidence, details, top_tag_votes)``;
        ``(None, 0.0, {}, votes)`` when there are no candidate departments.
    """
    tag_to_department = tag_to_department or DEFAULT_TAG_TO_DEPARTMENT
    tag_prob_dict = _to_tag_prob_dict(tag_prob_source, tag_names=tag_names)
    sorted_tags = sorted(
        tag_prob_dict.items(),
        key=lambda item: item[1],
        reverse=True,
    )
    if top_k is not None:
        sorted_tags = sorted_tags[:top_k]

    # Fail fast if any voting tag has no department mapping.
    assert_predicted_tags_mapped(
        [tag for tag, _ in sorted_tags],
        tag_to_department,
        valid_departments=dept_prototypes.keys(),
    )

    department_prob_lists: dict[str, list[float]] = {}
    top_tag_votes = []

    for tag, prob in sorted_tags:
        department = tag_to_department.get(tag)
        if department is None:
            continue
        clipped_prob = float(np.clip(prob, 0.0, 1.0))
        department_prob_lists.setdefault(department, []).append(clipped_prob)
        top_tag_votes.append(
            {
                "tag": tag,
                "score": clipped_prob,
                "department": department,
            }
        )

    details = {}
    # Candidates: every department with a prototype or at least one tag vote.
    candidate_departments = set(dept_prototypes) | set(department_prob_lists)
    if not candidate_departments:
        return None, 0.0, {}, top_tag_votes

    emb = np.asarray(embedding, dtype=float).reshape(1, -1)

    for department in candidate_departments:
        classifier_confidence = _department_classifier_confidence(
            department_prob_lists.get(department, [])
        )

        # Departments without a prototype get zero raw similarity.
        proto = dept_prototypes.get(department)
        raw_similarity = 0.0
        if proto is not None:
            raw_similarity = float(
                cosine_similarity(
                    emb,
                    np.asarray(proto, dtype=float).reshape(1, -1),
                )[0][0]
            )

        semantic_similarity = normalize_semantic_similarity(raw_similarity)
        hybrid_confidence = (
            float(classifier_weight) * classifier_confidence
            + float(similarity_weight) * semantic_similarity
        )

        details[department] = {
            "department": department,
            "classifier_confidence": float(classifier_confidence),
            "semantic_similarity": float(semantic_similarity),
            "raw_semantic_similarity": float(raw_similarity),
            "hybrid_confidence": float(hybrid_confidence),
        }

    best_department = max(
        details,
        key=lambda dept: details[dept]["hybrid_confidence"],
    )
    best_hybrid_confidence = float(details[best_department]["hybrid_confidence"])
    return best_department, best_hybrid_confidence, details, top_tag_votes
utils/review_policy_utils.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import joblib
6
+ import numpy as np
7
+
8
+
9
+ DEFAULT_TARGET_REVIEW_FRACTION = 0.15
10
+ DEFAULT_MIN_REVIEW_FRACTION = 0.10
11
+ DEFAULT_FALLBACK_THRESHOLD = 0.55
12
+
13
+
14
def build_default_review_policy(
    *,
    target_review_fraction: float = DEFAULT_TARGET_REVIEW_FRACTION,
    min_review_fraction: float = DEFAULT_MIN_REVIEW_FRACTION,
    fallback_threshold: float = DEFAULT_FALLBACK_THRESHOLD,
):
    """Build a static review policy with the percentile + fixed-threshold shape.

    The target fraction is floored by the minimum fraction, and both it and the
    fallback threshold are clipped into [0, 1].
    """
    raw_target = max(float(target_review_fraction), float(min_review_fraction))
    target = float(np.clip(raw_target, 0.0, 1.0))
    threshold = float(np.clip(fallback_threshold, 0.0, 1.0))
    return {
        "target_review_fraction": target,
        "min_review_fraction": float(np.clip(min_review_fraction, 0.0, 1.0)),
        "fallback_threshold": threshold,
        "percentile_threshold": threshold,
        "effective_threshold": threshold,
        "percentile_review_fraction": target,
        "effective_review_fraction": target,
        "method": "percentile_plus_fixed_threshold",
        "fit_source": "default",
        "sample_size": 0,
    }
40
+
41
+
42
def select_review_indices_by_percentile(
    hybrid_confidences,
    *,
    target_review_fraction: float = DEFAULT_TARGET_REVIEW_FRACTION,
    min_review_fraction: float = DEFAULT_MIN_REVIEW_FRACTION,
):
    """Flag the lowest-confidence fraction of samples for human review.

    Returns a boolean mask the same length as the input; ``True`` marks a
    sample selected for review. Stable (mergesort) ordering keeps the
    selection deterministic under ties.
    """
    scores = np.asarray(hybrid_confidences, dtype=float).reshape(-1)
    sample_count = scores.size
    if sample_count == 0:
        return np.zeros(0, dtype=bool)

    floored = max(float(target_review_fraction), float(min_review_fraction))
    fraction = float(np.clip(floored, 0.0, 1.0))

    review_count = int(np.ceil(sample_count * fraction))
    review_count = max(review_count, int(np.ceil(sample_count * float(min_review_fraction))))
    review_count = min(review_count, sample_count)

    review_mask = np.zeros(sample_count, dtype=bool)
    if review_count:
        lowest = np.argsort(scores, kind="mergesort")[:review_count]
        review_mask[lowest] = True
    return review_mask
70
+
71
+
72
def fit_review_policy(
    hybrid_confidences,
    *,
    target_review_fraction: float = DEFAULT_TARGET_REVIEW_FRACTION,
    min_review_fraction: float = DEFAULT_MIN_REVIEW_FRACTION,
    fallback_threshold: float = DEFAULT_FALLBACK_THRESHOLD,
):
    """Fit the review policy's thresholds from validation hybrid confidences.

    The percentile threshold is the highest confidence inside the bottom
    ``target_review_fraction`` of scores; the fixed fallback threshold is kept
    as an absolute floor. Summary statistics of the fit are stored alongside.

    Args:
        hybrid_confidences: Validation-split hybrid confidence scores.
        target_review_fraction: Desired fraction of tickets routed to review.
        min_review_fraction: Hard lower bound on that fraction.
        fallback_threshold: Absolute confidence floor, clipped into [0, 1].

    Returns:
        A policy dict (shape of ``build_default_review_policy``) with the
        fitted thresholds, observed review fractions, and score statistics.
    """
    scores = np.asarray(hybrid_confidences, dtype=float).reshape(-1)
    if scores.size == 0:
        # No validation data: fall back to the static default policy.
        return build_default_review_policy(
            target_review_fraction=target_review_fraction,
            min_review_fraction=min_review_fraction,
            fallback_threshold=fallback_threshold,
        )

    review_mask = select_review_indices_by_percentile(
        scores,
        target_review_fraction=target_review_fraction,
        min_review_fraction=min_review_fraction,
    )
    # Highest confidence among the flagged samples becomes the percentile cut.
    percentile_threshold = float(np.max(scores[review_mask])) if review_mask.any() else float(fallback_threshold)
    fallback_threshold = float(np.clip(fallback_threshold, 0.0, 1.0))
    effective_threshold = float(max(percentile_threshold, fallback_threshold))
    # Percentile rule is inclusive (<=), fixed rule exclusive (<) — this
    # mirrors the triggers in apply_controlled_review.
    effective_review_mask = (scores <= percentile_threshold) | (scores < fallback_threshold)

    policy = build_default_review_policy(
        target_review_fraction=target_review_fraction,
        min_review_fraction=min_review_fraction,
        fallback_threshold=fallback_threshold,
    )
    policy.update(
        {
            "percentile_threshold": percentile_threshold,
            "effective_threshold": effective_threshold,
            "percentile_review_fraction": float(review_mask.mean()),
            "effective_review_fraction": float(effective_review_mask.mean()),
            "fit_source": "validation",
            "sample_size": int(scores.size),
            "hybrid_confidence_mean": float(np.mean(scores)),
            "hybrid_confidence_std": float(np.std(scores)),
            "hybrid_confidence_min": float(np.min(scores)),
            "hybrid_confidence_max": float(np.max(scores)),
        }
    )
    return policy
117
+
118
+
119
def load_review_policy(path, default=None):
    """Load a persisted review policy, layering it over the built-in defaults.

    Falls back to ``default`` (or a fresh default policy) when the file is
    missing or does not contain a dict.
    """
    policy_path = Path(path)
    if policy_path.exists():
        payload = joblib.load(policy_path)
        if isinstance(payload, dict):
            policy = build_default_review_policy()
            policy.update(payload)
            return policy
    return default or build_default_review_policy()
128
+
129
+
130
def apply_controlled_review(mode, hybrid_confidence, review_policy=None):
    """Apply the controlled human-review injection on top of the base mode.

    A ticket not already in HUMAN_REVIEW is forced there when its hybrid
    confidence falls at or below the percentile threshold, or strictly below
    the fixed fallback threshold.

    Returns:
        Tuple ``(final_mode, requires_review, decision)`` where ``decision``
        records the thresholds, triggered rules, and a human-readable reason.
    """
    policy = review_policy or build_default_review_policy()
    confidence = float(np.clip(hybrid_confidence, 0.0, 1.0))

    pct_threshold = float(policy.get("percentile_threshold", DEFAULT_FALLBACK_THRESHOLD))
    fixed_threshold = float(policy.get("fallback_threshold", DEFAULT_FALLBACK_THRESHOLD))
    eff_threshold = float(policy.get("effective_threshold", max(pct_threshold, fixed_threshold)))

    hit_percentile = confidence <= pct_threshold
    hit_fixed = confidence < fixed_threshold
    forced = mode != "HUMAN_REVIEW" and (hit_percentile or hit_fixed)

    triggered = [
        rule_name
        for rule_name, fired in (("percentile", hit_percentile), ("fixed_threshold", hit_fixed))
        if fired
    ]

    final_mode = "HUMAN_REVIEW" if forced else mode
    requires_review = final_mode != "AUTO_ROUTE"

    if forced:
        reason = (
            f"Controlled review injection forced HUMAN_REVIEW: hybrid_confidence={confidence:.4f}, "
            f"percentile_threshold={pct_threshold:.4f}, fallback_threshold={fixed_threshold:.4f}."
        )
    elif mode == "HUMAN_REVIEW":
        reason = "Two-stage gate already routed the ticket to HUMAN_REVIEW."
    elif mode == "AUTO_ROUTE_FLAGGED":
        reason = "Ticket remains AUTO_ROUTE_FLAGGED after the controlled review check."
    else:
        reason = "Ticket passed the controlled review check and remains AUTO_ROUTE."

    decision = {
        "base_mode": mode,
        "final_mode": final_mode,
        "requires_review": requires_review,
        "forced_human_review": forced,
        "triggered_rules": triggered,
        "hybrid_confidence": confidence,
        "percentile_threshold": pct_threshold,
        "fallback_threshold": fixed_threshold,
        "effective_threshold": eff_threshold,
        "target_review_fraction": float(policy.get("target_review_fraction", DEFAULT_TARGET_REVIEW_FRACTION)),
        "reason": reason,
    }
    return final_mode, requires_review, decision
utils/runtime_utils.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import joblib
8
+ import yaml
9
+
10
+ try:
11
+ from .hybrid_routing_utils import DEFAULT_TAG_TO_DEPARTMENT
12
+ except ImportError: # pragma: no cover
13
+ from hybrid_routing_utils import DEFAULT_TAG_TO_DEPARTMENT
14
+
15
+
16
+ DEFAULT_DUPLICATE_THRESHOLD = 0.7623
17
+
18
+ DEFAULT_ROUTING_CONFIG = {
19
+ "global_threshold": 0.35,
20
+ "confidence_threshold": 0.45,
21
+ "default_department": "Human_Review",
22
+ "departments": dict(DEFAULT_TAG_TO_DEPARTMENT),
23
+ "priority_escalation": {
24
+ "critical": "Escalation",
25
+ "high": None,
26
+ },
27
+ }
28
+
29
+
30
+ def _resolve_base_dir(base_dir: str | Path | None = None) -> Path:
31
+ if base_dir is None:
32
+ return Path(__file__).resolve().parent
33
+ return Path(base_dir).resolve()
34
+
35
+
36
def resolve_model_dir(base_dir: str | Path | None = None) -> Path:
    """Locate the Models directory beside or above the resolved base dir.

    Returns the first existing candidate, or the first candidate path when
    none exists (callers may create it or fail on access).
    """
    root = _resolve_base_dir(base_dir)
    search_order = (root / "Models", root.parent / "Models")
    for candidate in search_order:
        if candidate.exists():
            return candidate
    return search_order[0]
46
+
47
+
48
def resolve_data_root(base_dir: str | Path | None = None) -> Path:
    """Locate the Datasets directory beside or above the resolved base dir.

    Returns the first existing candidate, or the first candidate path when
    none exists.
    """
    root = _resolve_base_dir(base_dir)
    search_order = (root / "Datasets", root.parent / "Datasets")
    for candidate in search_order:
        if candidate.exists():
            return candidate
    return search_order[0]
58
+
59
+
60
def resolve_dataset_file(
    base_dir: str | Path | None,
    filename: str,
    *,
    prefer_processed: bool = True,
) -> Path:
    """Find *filename* under the data root, optionally preferring Processed/.

    Args:
        base_dir: Base directory passed through to ``resolve_data_root``.
        filename: Dataset file name to locate.
        prefer_processed: When True, check ``Processed/<filename>`` before the
            data-root copy; when False, the order is reversed.

    Returns:
        The first existing candidate path.

    Raises:
        FileNotFoundError: If the file exists in none of the candidate
            locations; the message lists every path that was checked.
    """
    data_root = resolve_data_root(base_dir)
    ordered = []
    if prefer_processed:
        ordered.extend(
            [
                data_root / "Processed" / filename,
                data_root / filename,
            ]
        )
    else:
        ordered.extend(
            [
                data_root / filename,
                data_root / "Processed" / filename,
            ]
        )

    for candidate in ordered:
        if candidate.exists():
            return candidate

    # Bug fix: the message previously hard-coded '(unknown)' instead of the
    # requested filename, making the error useless for diagnosing which
    # dataset was missing.
    raise FileNotFoundError(
        f"Dataset '{filename}' not found in deployment bundle. Checked: {ordered}"
    )
90
+
91
+
92
def load_model_config(base_dir: str | Path | None = None) -> dict[str, Any]:
    """Load Models/model_config.pkl, returning {} when absent or not a dict."""
    config_path = resolve_model_dir(base_dir) / "model_config.pkl"
    if not config_path.exists():
        return {}
    payload = joblib.load(config_path)
    if isinstance(payload, dict):
        return payload
    return {}
99
+
100
+
101
def resolve_model_reference(
    model_ref: str | Path | None,
    *,
    base_dir: str | Path | None = None,
    model_dir: str | Path | None = None,
    default: str | None = None,
) -> str:
    """Resolve a stored model reference to an existing local path string.

    Handles absolute paths recorded on another machine by re-rooting anything
    after a ``Models`` path component onto the local model directory, then
    probing a sequence of fallback locations.

    Args:
        model_ref: Raw model path or name from persisted config; ``None``/""
            selects ``default``.
        base_dir: Base directory for relative resolution.
        model_dir: Explicit model directory; defaults to ``resolve_model_dir``.
        default: Value returned when the reference is empty or nothing exists.

    Returns:
        The first existing candidate (as a string), otherwise ``default`` if
        given, otherwise the raw reference unchanged.

    Raises:
        FileNotFoundError: If ``model_ref`` is empty and no default is given.
    """
    if model_ref in (None, ""):
        if default is None:
            raise FileNotFoundError("No model reference was provided.")
        return str(default)

    raw_value = str(model_ref)
    raw_path = Path(raw_value)
    base_path = _resolve_base_dir(base_dir)
    model_path_root = Path(model_dir).resolve() if model_dir is not None else resolve_model_dir(base_path)

    candidates: list[Path] = []
    if raw_path.is_absolute():
        candidates.append(raw_path)
        # Re-root the part after 'Models' onto the local model directory —
        # absolute paths may have been recorded on a different machine.
        if "Models" in raw_path.parts:
            model_idx = raw_path.parts.index("Models")
            suffix_parts = raw_path.parts[model_idx + 1 :]
            if suffix_parts:
                candidates.append(model_path_root.joinpath(*suffix_parts))
        candidates.append(model_path_root / raw_path.name)
        candidates.append(base_path / raw_path.name)
    else:
        candidates.extend(
            [
                raw_path,
                base_path / raw_path,
                model_path_root / raw_path,
                model_path_root / raw_path.name,
                base_path / raw_path.name,
            ]
        )

    # Probe candidates in order, de-duplicating by normalized string form.
    seen = set()
    for candidate in candidates:
        candidate = candidate.resolve() if candidate.exists() else candidate
        normalized = str(candidate)
        if normalized in seen:
            continue
        seen.add(normalized)
        if candidate.exists():
            return str(candidate)

    if default is not None:
        return str(default)
    return raw_value
152
+
153
+
154
def _merge_routing_config(loaded: dict[str, Any] | None) -> dict[str, Any]:
    """Overlay *loaded* routing settings onto a deep-ish copy of the defaults.

    Scalar keys are replaced wholesale; the two nested dicts are merged so
    default entries survive unless explicitly overridden.
    """
    scalar_keys = ("global_threshold", "confidence_threshold", "default_department")
    merged: dict[str, Any] = {key: DEFAULT_ROUTING_CONFIG[key] for key in scalar_keys}
    merged["departments"] = dict(DEFAULT_ROUTING_CONFIG["departments"])
    merged["priority_escalation"] = dict(DEFAULT_ROUTING_CONFIG["priority_escalation"])

    if not isinstance(loaded, dict):
        return merged

    for key in scalar_keys:
        if key in loaded:
            merged[key] = loaded[key]

    merged["departments"].update(loaded.get("departments") or {})
    merged["priority_escalation"].update(loaded.get("priority_escalation") or {})
    return merged
172
+
173
+
174
def load_routing_config(
    base_dir: str | Path | None = None,
) -> tuple[dict[str, Any], Path | None]:
    """Find and load routing_config.yaml, merged over the built-in defaults.

    Returns:
        ``(config, path)`` — ``path`` is the YAML file actually used, or
        ``None`` when no candidate exists and pure defaults are returned.
    """
    root = _resolve_base_dir(base_dir)
    model_dir = resolve_model_dir(root)
    search_order = (
        root / "config" / "routing_config.yaml",
        model_dir / "routing_config.yaml",
        root.parent / "config" / "routing_config.yaml",
    )

    for config_path in search_order:
        if config_path.exists():
            with config_path.open("r", encoding="utf-8") as handle:
                loaded = yaml.safe_load(handle)
            return _merge_routing_config(loaded), config_path

    return _merge_routing_config(None), None
192
+
193
+
194
def load_duplicate_threshold(base_dir: str | Path | None = None) -> float:
    """Read the tuned duplicate-detection threshold, falling back to the default.

    Any missing file, non-dict payload, or non-numeric value yields
    ``DEFAULT_DUPLICATE_THRESHOLD``.
    """
    threshold_path = resolve_model_dir(base_dir) / "duplicate_thresholds.pkl"
    if threshold_path.exists():
        payload = joblib.load(threshold_path)
        if isinstance(payload, dict):
            try:
                return float(payload.get("duplicate_threshold", DEFAULT_DUPLICATE_THRESHOLD))
            except (TypeError, ValueError):
                pass
    return float(DEFAULT_DUPLICATE_THRESHOLD)
205
+
206
+
207
def load_metric_artifact(
    base_dir: str | Path | None,
    filename: str,
) -> dict[str, Any]:
    """Load a JSON metric artifact from the Models directory.

    Raises:
        FileNotFoundError: If the artifact file does not exist.
    """
    artifact_path = resolve_model_dir(base_dir) / filename
    if not artifact_path.exists():
        raise FileNotFoundError(f"Metric artifact not found: {artifact_path}")
    with artifact_path.open("r", encoding="utf-8") as handle:
        return json.load(handle)