Upload 6 files
Browse files- .gitattributes +1 -0
- Data/Domain-A_Dataset_Clean.csv +3 -0
- utils/calibration_utils.py +146 -0
- utils/duplicate_detection_utils.py +315 -0
- utils/hybrid_routing_utils.py +769 -0
- utils/review_policy_utils.py +176 -0
- utils/runtime_utils.py +216 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
Data/Domain-A_Dataset_Clean.csv filter=lfs diff=lfs merge=lfs -text
|
Data/Domain-A_Dataset_Clean.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a75145c90f0d6dad33433e7fa996dae9941da0cda2065b2015b327e368a19c91
|
| 3 |
+
size 20014738
|
utils/calibration_utils.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import joblib
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
from scipy.optimize import minimize_scalar
|
| 10 |
+
except Exception: # pragma: no cover - scipy may be unavailable in some runtimes
|
| 11 |
+
minimize_scalar = None
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
DEFAULT_EPS = 1e-6


def identity_temperature_scaler(eps: float = DEFAULT_EPS) -> dict:
    """Build a no-op temperature-scaling config (temperature fixed at 1.0).

    Serves as the neutral fallback when no fitted scaler is available.
    """
    config = dict(
        temperature=1.0,
        eps=float(eps),
        method="logit_temperature_scaling",
        fit_objective="binary_nll",
        fit_split="validation",
        base_calibration="identity",
    )
    return config
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _ensure_2d(array_like) -> tuple[np.ndarray, bool]:
|
| 29 |
+
arr = np.asarray(array_like, dtype=float)
|
| 30 |
+
was_1d = arr.ndim == 1
|
| 31 |
+
if was_1d:
|
| 32 |
+
arr = arr.reshape(1, -1)
|
| 33 |
+
return arr, was_1d
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _restore_shape(arr: np.ndarray, was_1d: bool) -> np.ndarray:
|
| 37 |
+
if was_1d:
|
| 38 |
+
return arr.reshape(-1)
|
| 39 |
+
return arr
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _binary_nll(y_true, probs, eps: float = DEFAULT_EPS) -> float:
    """Mean binary negative log-likelihood with probabilities clipped to (eps, 1-eps)."""
    labels = np.asarray(y_true, dtype=float)
    clipped = np.clip(np.asarray(probs, dtype=float), eps, 1.0 - eps)
    per_sample = labels * np.log(clipped) + (1.0 - labels) * np.log(1.0 - clipped)
    return float(-per_sample.mean())
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def apply_per_class_calibration(raw_probs, calibrators=None):
    """Run each probability column through its positional calibrator, if any.

    Calibrators exposing ``predict`` are preferred; otherwise ``transform``
    is used. A ``None`` entry (or no calibrators at all) leaves that column
    untouched. The input's 1-D/2-D shape is preserved on output.
    """
    probs, was_1d = _ensure_2d(raw_probs)
    adjusted = probs.copy()

    if not calibrators:
        return _restore_shape(adjusted, was_1d)

    for col, cal in enumerate(calibrators):
        if cal is None:
            continue
        column = adjusted[:, col]
        predictor = getattr(cal, "predict", None)
        if predictor is not None:
            adjusted[:, col] = predictor(column)
        else:
            adjusted[:, col] = cal.transform(column)

    return _restore_shape(adjusted, was_1d)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def apply_temperature_scaling(probabilities, temperature_scaler=None):
    """Sharpen or soften probabilities by dividing their logits by a temperature.

    Probabilities are clipped away from {0, 1} before the logit transform,
    and the scaled logits are clamped to [-50, 50] to keep the sigmoid
    numerically stable. A missing scaler means the identity (T = 1.0).
    """
    probs, was_1d = _ensure_2d(probabilities)
    scaler = temperature_scaler or identity_temperature_scaler()

    eps = float(scaler.get("eps", DEFAULT_EPS))
    # Guard against zero/negative temperatures.
    temperature = max(float(scaler.get("temperature", 1.0)), eps)

    safe = np.clip(probs, eps, 1.0 - eps)
    logits = np.log(safe / (1.0 - safe))
    tempered = np.clip(logits / temperature, -50.0, 50.0)
    rescaled = 1.0 / (1.0 + np.exp(-tempered))

    return _restore_shape(rescaled, was_1d)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def calibrate_probabilities(raw_probs, tag_calibrators=None, temperature_scaler=None):
    """Apply per-class calibration first, then global temperature scaling."""
    staged = apply_per_class_calibration(raw_probs, tag_calibrators)
    return apply_temperature_scaling(staged, temperature_scaler)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def max_confidence(probabilities) -> float:
    """Return the largest probability in the input, or 0.0 for empty input."""
    values = np.asarray(probabilities, dtype=float)
    return float(values.max()) if values.size else 0.0
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def fit_temperature_scaler(validation_probs, y_true, bounds=(0.5, 5.0), eps: float = DEFAULT_EPS):
    """Fit a single logit temperature on validation data by minimising binary NLL.

    Uses scipy's bounded scalar minimiser when available; otherwise falls
    back to a 256-point log-spaced grid search over ``bounds``. Returns a
    scaler dict carrying the fitted temperature plus before/after NLL and
    mean-confidence diagnostics.

    Raises:
        ValueError: if ``validation_probs`` and ``y_true`` shapes disagree.
    """
    probs, _ = _ensure_2d(validation_probs)
    y = np.asarray(y_true, dtype=float)
    if probs.shape != y.shape:
        raise ValueError(
            f"Shape mismatch for temperature scaling: probs={probs.shape}, y_true={y.shape}"
        )

    def objective(temp: float) -> float:
        # NLL of the validation labels under temperature-scaled probabilities.
        candidate = apply_temperature_scaling(probs, {"temperature": temp, "eps": eps})
        return _binary_nll(y, candidate, eps=eps)

    if minimize_scalar is None:
        # scipy unavailable: brute-force a log-spaced temperature grid.
        grid = np.exp(np.linspace(np.log(bounds[0]), np.log(bounds[1]), 256))
        losses = np.array([objective(float(temp)) for temp in grid], dtype=float)
        best_temperature = float(grid[int(losses.argmin())])
    else:
        result = minimize_scalar(objective, bounds=bounds, method="bounded")
        # Fall back to the identity temperature if the optimiser fails.
        best_temperature = float(result.x) if result.success else 1.0

    best_temperature = max(best_temperature, eps)
    scaled = apply_temperature_scaling(probs, {"temperature": best_temperature, "eps": eps})

    return {
        "temperature": round(best_temperature, 6),
        "eps": float(eps),
        "method": "logit_temperature_scaling",
        "fit_objective": "binary_nll",
        "fit_split": "validation",
        "base_calibration": "per_class_calibrator_then_temperature",
        "nll_before": _binary_nll(y, probs, eps=eps),
        "nll_after": _binary_nll(y, scaled, eps=eps),
        "mean_conf_before": float(np.mean(np.max(probs, axis=1))),
        "mean_conf_after": float(np.mean(np.max(scaled, axis=1))),
    }
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def load_temperature_scaler(path, default=None):
    """Load a temperature-scaler dict from disk, falling back to identity.

    A persisted plain number is wrapped into a minimal scaler dict; a
    missing file yields ``default`` (or the identity scaler if falsy).
    """
    scaler_path = Path(path)
    if not scaler_path.exists():
        return default or identity_temperature_scaler()
    loaded = joblib.load(scaler_path)
    if isinstance(loaded, dict):
        return loaded
    # Legacy format: a bare temperature value rather than a config dict.
    return {"temperature": float(loaded), "eps": DEFAULT_EPS}
|
utils/duplicate_detection_utils.py
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import time
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import faiss
|
| 7 |
+
from huggingface_hub import hf_hub_download
|
| 8 |
+
import joblib
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from sentence_transformers import SentenceTransformer
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
from .runtime_utils import (
|
| 15 |
+
load_duplicate_threshold,
|
| 16 |
+
load_model_config,
|
| 17 |
+
resolve_dataset_file,
|
| 18 |
+
resolve_model_dir,
|
| 19 |
+
resolve_model_reference,
|
| 20 |
+
)
|
| 21 |
+
except ImportError: # pragma: no cover
|
| 22 |
+
from runtime_utils import (
|
| 23 |
+
load_duplicate_threshold,
|
| 24 |
+
load_model_config,
|
| 25 |
+
resolve_dataset_file,
|
| 26 |
+
resolve_model_dir,
|
| 27 |
+
resolve_model_reference,
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class CachedDuplicateDetectionEngine:
    """FAISS-backed duplicate-ticket detector over a pre-embedded corpus.

    Loads the ticket dataset, ticket ids, and precomputed embeddings from
    the Hugging Face Hub, L2-normalises the embeddings, and serves
    inner-product (cosine on unit vectors) nearest-neighbour queries.

    Fixes over the previous revision:
    - the dataset CSV was downloaded and parsed twice; now fetched once.
    - ``hf_hub_download`` returns a ``str``; ``.exists()`` was called on it
      directly (AttributeError) and ``joblib.load`` ran before the check.
    - redundant in-method ``from huggingface_hub import hf_hub_download``
      (module-level import already present) and a doubled float32 cast.
    """

    def __init__(self, base_dir: str | Path | None = None):
        """Load all duplicate-detection assets and build the search index.

        Args:
            base_dir: project root used to resolve model artifacts;
                defaults to this file's directory.
        """
        self.base_dir = Path(base_dir).resolve() if base_dir is not None else Path(__file__).resolve().parent
        self.model_dir = resolve_model_dir(self.base_dir)
        self.model_config = load_model_config(self.base_dir)
        self.duplicate_threshold = load_duplicate_threshold(self.base_dir)

        # Fetch the dataset exactly once.
        dataset_path = hf_hub_download(
            repo_id="Eklavya73/ticket-duplicate-assets",
            filename="Domain-A_Dataset_Clean.csv",
        )
        self.dataset = pd.read_csv(dataset_path)
        self.db_texts = self.dataset["text"].astype(str).tolist()

        # Wrap the returned str path before calling .exists().
        ticket_id_path = Path(
            hf_hub_download(
                repo_id="Eklavya73/ticket-duplicate-assets",
                filename="ticket_ids.pkl",
            )
        )
        if ticket_id_path.exists():
            self.db_ids = [str(ticket_id) for ticket_id in joblib.load(ticket_id_path)]
        else:
            # Fall back to the dataset's own id column.
            self.db_ids = self.dataset["ticket_id"].astype(str).tolist()

        embeddings_path = hf_hub_download(
            repo_id="Eklavya73/ticket-duplicate-assets",
            filename="db_embeddings.npy",
        )
        self.db_embeddings = np.load(embeddings_path).astype("float32")

        if self.db_embeddings.ndim != 2:
            raise ValueError(
                f"Expected 2D duplicate embedding matrix, got shape={self.db_embeddings.shape}"
            )
        # Unit-normalise so inner product == cosine similarity.
        faiss.normalize_L2(self.db_embeddings)

        self.embedding_dim = int(self.db_embeddings.shape[1])
        self.faiss_meta = self._load_faiss_meta()
        self.index = self._build_index(self.db_embeddings)
        self.index.add(self.db_embeddings)
        # Sentence encoder is loaded lazily on first text query.
        self._encoder: SentenceTransformer | None = None

    def _load_faiss_meta(self) -> dict:
        """Load persisted FAISS index metadata, or synthesise a flat-index default."""
        meta_path = self.model_dir / "faiss_index_meta.pkl"
        if meta_path.exists():
            loaded = joblib.load(meta_path)
            if isinstance(loaded, dict):
                return loaded
        return {
            "dimension": self.embedding_dim,
            "index_type": "flat",
            "size": len(self.db_texts),
        }

    def _build_index(self, embeddings: np.ndarray):
        """Build an IVF index when configured and feasible, else exact flat IP."""
        index_type = str(self.faiss_meta.get("index_type", "flat")).lower()
        nlist = max(1, int(self.faiss_meta.get("nlist", 256)))
        nprobe = max(1, int(self.faiss_meta.get("nprobe", 48)))

        # IVF needs enough vectors to train its coarse quantizer.
        if index_type == "ivf" and len(embeddings) >= max(64, nlist):
            quantizer = faiss.IndexFlatIP(self.embedding_dim)
            index = faiss.IndexIVFFlat(
                quantizer,
                self.embedding_dim,
                nlist,
                faiss.METRIC_INNER_PRODUCT,
            )
            index.train(embeddings)
            index.nprobe = min(nprobe, nlist)
            return index

        return faiss.IndexFlatIP(self.embedding_dim)

    @property
    def index_size(self) -> int:
        """Number of vectors currently stored in the FAISS index."""
        return int(self.index.ntotal)

    def _get_encoder(self) -> SentenceTransformer:
        """Lazily construct the sentence encoder configured for duplicates."""
        if self._encoder is None:
            model_ref = resolve_model_reference(
                self.model_config.get("duplicate_sbert_model", "Eklavya73/duplicate_sbert"),
                base_dir=self.base_dir,
                model_dir=self.model_dir,
                default="all-mpnet-base-v2",
            )
            self._encoder = SentenceTransformer(model_ref)
        return self._encoder

    def _encode(
        self,
        texts,
        *,
        batch_size: int = 64,
        show_progress_bar: bool = False,
    ) -> np.ndarray:
        """Encode texts into unit-normalised float32 embeddings."""
        encoder = self._get_encoder()
        embeddings = encoder.encode(
            list(texts),
            batch_size=batch_size,
            show_progress_bar=show_progress_bar,
            normalize_embeddings=True,
        )
        return np.asarray(embeddings, dtype="float32")

    def _normalize_query(self, embedding) -> np.ndarray:
        """Copy an embedding into a (1, dim) unit-normalised query row."""
        query = np.asarray(embedding, dtype="float32").reshape(1, -1).copy()
        faiss.normalize_L2(query)
        return query

    def _search(self, embedding, *, k: int = 20):
        """Run a k-NN search; returns (scores, indices) FAISS-style arrays."""
        if self.index_size == 0:
            return np.empty((1, 0), dtype="float32"), np.empty((1, 0), dtype=int)
        query = self._normalize_query(embedding)
        # Clamp k into [1, index_size].
        return self.index.search(query, min(max(1, int(k)), self.index_size))

    def find_best_match(
        self,
        embedding,
        *,
        k: int = 20,
        exclude_indices=None,
    ) -> dict | None:
        """Return the highest-scoring non-excluded neighbour, or None.

        FAISS pads short result lists with index -1; those entries are
        skipped along with any index in ``exclude_indices``.
        """
        scores, indices = self._search(embedding, k=k)
        excluded = set(int(idx) for idx in (exclude_indices or []))

        for score, idx in zip(scores[0], indices[0]):
            idx = int(idx)
            if idx < 0 or idx in excluded:
                continue
            return {
                "index": idx,
                "ticket_id": self.db_ids[idx] if idx < len(self.db_ids) else None,
                "duplicate_of": self.db_ids[idx] if idx < len(self.db_ids) else None,
                "matched_text": self.db_texts[idx] if idx < len(self.db_texts) else None,
                "similarity": float(score),
            }
        return None

    def detect_duplicate(
        self,
        text: str | None = None,
        *,
        embedding=None,
        k: int = 20,
        exclude_indices=None,
    ) -> dict | None:
        """Return the best match if it clears the duplicate threshold, else None.

        Raises:
            ValueError: if neither ``text`` nor ``embedding`` is supplied.
        """
        if embedding is None:
            if text is None:
                raise ValueError("Either text or embedding must be provided.")
            embedding = self._encode([str(text)])[0]

        match = self.find_best_match(
            embedding,
            k=k,
            exclude_indices=exclude_indices,
        )
        if match is None:
            return None
        if float(match["similarity"]) < float(self.duplicate_threshold):
            return None
        return match

    def add_ticket(
        self,
        ticket_id: str,
        text: str,
        *,
        embedding=None,
    ) -> None:
        """Append a new ticket to the index and the in-memory corpus."""
        if embedding is None:
            embedding = self._encode([str(text)])[0]

        query = self._normalize_query(embedding)
        self.index.add(query)
        self.db_ids.append(str(ticket_id))
        self.db_texts.append(str(text))
        self.db_embeddings = np.vstack([self.db_embeddings, query]).astype("float32")
        self.faiss_meta["size"] = int(self.index.ntotal)

    def benchmark_duplicate_detection(self, *, num_queries: int = 200, k: int = 5) -> dict:
        """Compare FAISS search to exact brute-force search on sampled queries.

        Treats the exact top-1 decision as ground truth for the duplicate
        precision/recall/F1 numbers; also reports mean latencies and the
        recall of the exact top-1 neighbour within the FAISS top-k.
        """
        if self.index_size <= 1:
            # Nothing meaningful to benchmark with <= 1 vector.
            return {
                "exact_latency_ms": 0.0,
                "faiss_latency_ms": 0.0,
                "speedup_vs_exact": 0.0,
                "recall_at_k": 0.0,
                "duplicate_precision": 0.0,
                "duplicate_recall": 0.0,
                "duplicate_f1": 0.0,
                "duplicate_eval_pairs": 0,
            }

        k = max(1, int(k))
        rng = np.random.default_rng(42)  # fixed seed for reproducible benchmarks
        query_count = min(int(num_queries), self.index_size)
        sampled_indices = rng.choice(self.index_size, size=query_count, replace=False)

        exact_hits = 0
        tp = 0
        fp = 0
        fn = 0
        exact_latencies = []
        faiss_latencies = []

        for query_idx in sampled_indices:
            query_embedding = self.db_embeddings[query_idx]

            # Exact search: full matrix-vector product, self-match masked out.
            exact_start = time.perf_counter()
            similarities = self.db_embeddings @ query_embedding
            similarities[int(query_idx)] = -np.inf
            exact_top = np.argsort(-similarities)[:k]
            exact_score = float(similarities[int(exact_top[0])]) if exact_top.size else 0.0
            exact_is_duplicate = exact_score >= float(self.duplicate_threshold)
            exact_latencies.append((time.perf_counter() - exact_start) * 1000.0)

            # FAISS search; k+1 so the self-match can be dropped.
            faiss_start = time.perf_counter()
            distances, neighbors = self._search(query_embedding, k=k + 1)
            faiss_latencies.append((time.perf_counter() - faiss_start) * 1000.0)

            faiss_candidates = []
            faiss_best_score = 0.0
            for score, neighbor_idx in zip(distances[0], neighbors[0]):
                neighbor_idx = int(neighbor_idx)
                if neighbor_idx < 0 or neighbor_idx == int(query_idx):
                    continue
                faiss_candidates.append(neighbor_idx)
                if len(faiss_candidates) == 1:
                    faiss_best_score = float(score)
                if len(faiss_candidates) >= k:
                    break

            if exact_top.size and int(exact_top[0]) in set(faiss_candidates):
                exact_hits += 1

            pred_is_duplicate = bool(faiss_candidates) and faiss_best_score >= float(self.duplicate_threshold)
            if pred_is_duplicate and exact_is_duplicate:
                tp += 1
            elif pred_is_duplicate and not exact_is_duplicate:
                fp += 1
            elif exact_is_duplicate and not pred_is_duplicate:
                fn += 1

        # max(..., 1) avoids division by zero when a class is empty.
        precision = tp / max(tp + fp, 1)
        recall = tp / max(tp + fn, 1)
        f1 = 0.0 if (precision + recall) == 0.0 else (2.0 * precision * recall) / (precision + recall)

        exact_latency_ms = float(np.mean(exact_latencies)) if exact_latencies else 0.0
        faiss_latency_ms = float(np.mean(faiss_latencies)) if faiss_latencies else 0.0

        return {
            "exact_latency_ms": exact_latency_ms,
            "faiss_latency_ms": faiss_latency_ms,
            "speedup_vs_exact": (
                float(exact_latency_ms / faiss_latency_ms)
                if faiss_latency_ms > 0.0
                else 0.0
            ),
            "recall_at_k": float(exact_hits / max(query_count, 1)),
            "duplicate_precision": float(precision),
            "duplicate_recall": float(recall),
            "duplicate_f1": float(f1),
            "duplicate_eval_pairs": int(query_count),
        }

    def get_duplicate_metrics(self) -> dict:
        """Summarise the active threshold and FAISS index configuration."""
        return {
            "duplicate_threshold": float(self.duplicate_threshold),
            "faiss_meta": {
                "dimension": int(self.embedding_dim),
                "index_type": str(self.faiss_meta.get("index_type", "flat")),
                "nlist": int(self.faiss_meta.get("nlist", 0)),
                "nprobe": int(self.faiss_meta.get("nprobe", 0)),
                "size": int(self.index_size),
            },
        }
|
utils/hybrid_routing_utils.py
ADDED
|
@@ -0,0 +1,769 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import ast
|
| 4 |
+
from collections import Counter, defaultdict
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Mapping
|
| 7 |
+
|
| 8 |
+
import joblib
|
| 9 |
+
import numpy as np
|
| 10 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Default routing table from predicted ticket tags to the department that
# should handle them. Used when no learned/persisted mapping is supplied
# (see build_routing_label_policy, which merges this with other mappings).
DEFAULT_TAG_TO_DEPARTMENT = {
    "technical_issue": "Technical_Support",
    "hardware_issue": "Technical_Support",
    "software_issue": "Technical_Support",
    "network_issue": "Technical_Support",
    "performance_issue": "Technical_Support",
    "system_issue": "Technical_Support",
    "configuration_issue": "Technical_Support",
    "compatibility_issue": "Technical_Support",
    "maintenance_issue": "Technical_Support",
    "update_request": "Technical_Support",
    "scalability_issue": "Technical_Support",
    "synchronization_issue": "Technical_Support",
    "access_issue": "IT_Support",
    "authentication_issue": "IT_Support",
    "security_issue": "IT_Support",
    "api_issue": "IT_Support",
    "data_issue": "IT_Support",
    "integration_issue": "IT_Support",
    "incident": "Service_Outages_And_Maintenance",
    "service_request": "Customer_Service",
    "general_inquiry": "Customer_Service",
    "billing_issue": "Billing_And_Payments",
    "refund_request": "Billing_And_Payments",
    "order_issue": "Returns_And_Exchanges",
    "feature_request": "Product_Support",
    "training_request": "Human_Resources",
    "sales_inquiry": "Sales_And_Presales",
    "digital_marketing": "Marketing",
    "digital_strategy": "Marketing",
}
|
| 44 |
+
|
| 45 |
+
# Aliases from normalized department keys (lower-case, underscores, "&" ->
# "and"; see _normalize_department_key) to canonical department names.
# Consulted by canonicalize_department_name when a name does not match the
# caller-provided whitelist directly.
KNOWN_DEPARTMENT_ALIASES = {
    "technical_support": "Technical_Support",
    "it_support": "IT_Support",
    "service_outages_and_maintenance": "Service_Outages_And_Maintenance",
    "customer_service": "Customer_Service",
    "billing_and_payments": "Billing_And_Payments",
    "returns_and_exchanges": "Returns_And_Exchanges",
    "product_support": "Product_Support",
    "human_resources": "Human_Resources",
    "sales_and_presales": "Sales_And_Presales",
    "marketing": "Marketing",
    "general_inquiry": "Customer_Service",
}
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _normalize_mapping(mapping) -> dict[str, str]:
|
| 61 |
+
if not isinstance(mapping, Mapping):
|
| 62 |
+
return {}
|
| 63 |
+
return {
|
| 64 |
+
str(tag): str(department)
|
| 65 |
+
for tag, department in mapping.items()
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _extract_tag_mapping(policy_or_mapping) -> dict[str, str]:
    """Accept either a full routing policy or a bare tag->department mapping."""
    is_policy = (
        isinstance(policy_or_mapping, Mapping)
        and "tag_to_department" in policy_or_mapping
    )
    if is_policy:
        # Full policy dict: pull out the nested mapping.
        return _normalize_mapping(policy_or_mapping.get("tag_to_department", {}))
    return _normalize_mapping(policy_or_mapping)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _normalize_department_key(value: str) -> str:
|
| 76 |
+
return (
|
| 77 |
+
str(value)
|
| 78 |
+
.strip()
|
| 79 |
+
.lower()
|
| 80 |
+
.replace("&", "and")
|
| 81 |
+
.replace("-", "_")
|
| 82 |
+
.replace(" ", "_")
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def canonicalize_department_name(name, *, valid_departments=None):
    """Map a free-form department string onto a known canonical department.

    Returns None for blank/"nan" input. With a ``valid_departments``
    whitelist, unmatched names (after normalisation and alias lookup)
    also return None; without one, the stripped input is passed through.
    """
    if name is None:
        return None

    text = str(name).strip()
    if not text or text.lower() == "nan":
        return None

    whitelist = [str(department) for department in (valid_departments or [])]
    by_key = {
        _normalize_department_key(department): department
        for department in whitelist
    }
    key = _normalize_department_key(text)

    if key in by_key:
        return by_key[key]

    alias = KNOWN_DEPARTMENT_ALIASES.get(key)
    if alias is not None:
        return alias

    return None if whitelist else text
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def parse_tag_list(value) -> list[str]:
    """Parse a tag list that may arrive as a real list or a stringified one.

    Strings are evaluated with ``ast.literal_eval``; anything that is not a
    list (or does not evaluate to one) yields [].
    """
    if isinstance(value, list):
        return [str(item) for item in value]

    if not isinstance(value, str):
        # Covers None and any other non-list, non-string input.
        return []

    text = value.strip()
    if not text:
        return []

    try:
        candidate = ast.literal_eval(text)
    except Exception:
        # Not a valid Python literal — treat as no tags.
        return []

    if isinstance(candidate, list):
        return [str(item) for item in candidate]
    return []
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def validate_routing_label_mapping(
    tag_to_department,
    *,
    valid_tags=None,
    valid_departments=None,
):
    """Split a tag→department mapping into clean entries and problem reports.

    Returns ``(cleaned_mapping, missing_tags, stale_tags, invalid_departments)``:
    *stale* tags are mapped but no longer in ``valid_tags``; *invalid* entries
    point at departments outside ``valid_departments``; *missing* tags are
    valid but absent from the cleaned mapping. Checks are skipped for any
    validation list passed as ``None``.
    """
    mapping = _normalize_mapping(tag_to_department)

    known_tags = [str(tag) for tag in valid_tags] if valid_tags is not None else None
    known_departments = (
        {str(dept) for dept in valid_departments}
        if valid_departments is not None
        else None
    )

    stale_tags = (
        [tag for tag in mapping if tag not in known_tags]
        if known_tags is not None
        else []
    )

    invalid_departments = (
        {tag: dept for tag, dept in mapping.items() if dept not in known_departments}
        if known_departments is not None
        else {}
    )

    cleaned_mapping = {
        tag: dept
        for tag, dept in mapping.items()
        if tag not in stale_tags and tag not in invalid_departments
    }

    missing_tags = (
        [tag for tag in known_tags if tag not in cleaned_mapping]
        if known_tags is not None
        else []
    )

    return cleaned_mapping, missing_tags, stale_tags, invalid_departments
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def build_routing_label_policy(
    *,
    valid_tags,
    valid_departments,
    base_mapping=None,
    fallback_tag_to_department=None,
    default_department="Human_Review",
):
    """Merge base and fallback tag mappings into a validated policy dict.

    ``base_mapping`` wins over ``fallback_tag_to_department`` on conflicting
    tags; the merged mapping is validated and the resulting diagnostics
    (missing/stale tags, invalid departments) are embedded in the policy.
    """
    merged = _normalize_mapping(fallback_tag_to_department)
    merged.update(_normalize_mapping(base_mapping))

    cleaned, missing, stale, invalid = validate_routing_label_mapping(
        merged,
        valid_tags=valid_tags,
        valid_departments=valid_departments,
    )

    return {
        "tag_to_department": cleaned,
        "valid_tags": [str(tag) for tag in valid_tags],
        "valid_departments": [str(dept) for dept in valid_departments],
        "default_department": str(default_department),
        "missing_tags": missing,
        "stale_tags": stale,
        "invalid_departments": invalid,
        "mapping_count": len(cleaned),
    }
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def assert_valid_routing_label_policy(
    policy_or_mapping,
    *,
    valid_tags,
    valid_departments,
):
    """Validate a policy or raw mapping, raising ``ValueError`` on any defect.

    Returns the cleaned tag→department mapping when every valid tag maps to
    a valid department with nothing stale or missing.
    """
    cleaned, missing, stale, invalid = validate_routing_label_mapping(
        _extract_tag_mapping(policy_or_mapping),
        valid_tags=valid_tags,
        valid_departments=valid_departments,
    )

    if missing or stale or invalid:
        raise ValueError(
            "Routing label policy validation failed: "
            f"missing_tags={missing}, "
            f"stale_tags={stale}, "
            f"invalid_departments={invalid}"
        )

    return cleaned
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def assert_predicted_tags_mapped(
    predicted_tags,
    policy_or_mapping,
    *,
    valid_departments,
):
    """Raise ``ValueError`` unless every predicted tag routes to a valid department.

    Tags absent from the mapping are reported as missing; tags whose mapped
    department falls outside ``valid_departments`` (including the unmapped
    ones, whose target is ``None``) are reported as invalid.
    """
    mapping = _extract_tag_mapping(policy_or_mapping)
    allowed = {str(dept) for dept in valid_departments}

    missing = []
    bad_targets = {}
    for raw_tag in predicted_tags:
        key = str(raw_tag)
        target = mapping.get(key)
        if key not in mapping:
            missing.append(key)
        if target not in allowed:
            bad_targets[key] = target

    if missing or bad_targets:
        raise ValueError(
            "Predicted tags are not fully covered by the routing label policy: "
            f"missing_tags={missing}, "
            f"invalid_departments={bad_targets}"
        )
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def _normalize_vector(vector):
    """Return *vector* flattened to a 1-D float array with unit L2 norm.

    Empty and all-zero inputs are returned as-is to avoid division by zero.
    """
    flat = np.asarray(vector, dtype=float).reshape(-1)
    if flat.size == 0:
        return flat
    magnitude = np.linalg.norm(flat)
    return flat if magnitude == 0.0 else flat / magnitude
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def _build_department_semantic_vectors(
    valid_departments,
    *,
    dept_prototypes=None,
    embed_text_fn=None,
):
    """Build one unit vector per department from prototypes and label text.

    A data-driven prototype (weight 0.25) and an embedding of the humanised
    label text (weight 0.75) are blended and re-normalised when both exist;
    departments with neither source are omitted from the result.
    """
    vectors = {}

    for dept in valid_departments:
        parts = []

        if dept_prototypes is not None and dept in dept_prototypes:
            prototype = _normalize_vector(dept_prototypes[dept])
            if prototype.size:
                parts.append((0.25, prototype))

        if embed_text_fn is not None:
            label_text = str(dept).replace("_", " ")
            label_embedding = _normalize_vector(embed_text_fn(label_text))
            if label_embedding.size:
                parts.append((0.75, label_embedding))

        if not parts:
            continue

        if len(parts) == 1:
            vectors[str(dept)] = parts[0][1]
        else:
            blended = sum(weight * vec for weight, vec in parts)
            vectors[str(dept)] = _normalize_vector(blended)

    return vectors
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def _department_semantic_scores(tag, department_vectors, *, embed_text_fn=None):
    """Score *tag* against each department vector via normalised dot product.

    Returns ``{department: score in [0, 1]}``. Departments whose vector shape
    does not match the tag embedding are skipped; an unavailable embedder or
    an empty tag embedding yields {}.
    """
    if embed_text_fn is None or not department_vectors:
        return {}

    tag_embedding = _normalize_vector(embed_text_fn(str(tag).replace("_", " ")))
    if tag_embedding.size == 0:
        return {}

    return {
        dept: normalize_semantic_similarity(float(np.dot(tag_embedding, vec)))
        for dept, vec in department_vectors.items()
        if vec.shape == tag_embedding.shape
    }
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def build_department_prototypes_from_tag_map(
    texts,
    tag_lists,
    *,
    tag_to_department,
    embed_texts_fn,
    min_examples=5,
):
    """Compute a unit mean-embedding prototype per department from tagged texts.

    Each text is attributed to every department its mapped tags point at
    (deduplicated per row); departments with fewer than ``min_examples``
    texts, or whose mean embedding is empty, are skipped.

    Raises:
        ValueError: if ``embed_texts_fn`` is missing or no prototype is built.
    """
    if embed_texts_fn is None:
        raise ValueError("embed_texts_fn is required to build department prototypes.")

    mapping = _extract_tag_mapping(tag_to_department)
    grouped: dict[str, list[str]] = defaultdict(list)

    for text, raw_tags in zip(texts, tag_lists):
        departments = {mapping[tag] for tag in parse_tag_list(raw_tags) if tag in mapping}
        for dept in departments:
            grouped[dept].append(str(text))

    prototypes = {}
    threshold = int(min_examples)
    for dept, examples in grouped.items():
        if len(examples) < threshold:
            continue

        embeddings = np.asarray(embed_texts_fn(examples), dtype=float)
        if embeddings.ndim == 1:
            embeddings = embeddings.reshape(1, -1)

        centroid = _normalize_vector(np.mean(embeddings, axis=0))
        if centroid.size:
            prototypes[dept] = centroid

    if not prototypes:
        raise ValueError("No department prototypes could be built from the provided texts and tags.")

    return prototypes
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def analyze_routing_label_policy(
    dataset_paths,
    *,
    tag_classes,
    valid_departments,
    existing_mapping=None,
    queue_column="queue",
    tags_column="tags",
):
    """Cross-check the tag->department mapping against observed CSV data.

    Reads ``queue_column`` / ``tags_column`` from every CSV in *dataset_paths*,
    counts which department each valid tag co-occurs with, and reports where
    the existing mapping is missing, redundant, invalid, or disagrees with
    the majority department observed in the data.

    Raises:
        FileNotFoundError: if any dataset path does not exist.
    """
    # Local import keeps pandas optional for callers that never run analysis.
    import pandas as pd

    valid_tags = [str(tag) for tag in tag_classes]
    valid_departments = [str(department) for department in valid_departments]
    existing_mapping = _extract_tag_mapping(existing_mapping)

    # tag -> Counter(department -> co-occurrence count)
    tag_department_counts: dict[str, Counter] = defaultdict(Counter)
    observed_departments = set()
    total_rows = 0
    used_rows = 0

    for dataset_path in dataset_paths:
        path = Path(dataset_path)
        if not path.exists():
            raise FileNotFoundError(f"Dataset not found for routing-label analysis: {path}")

        frame = pd.read_csv(path, usecols=[queue_column, tags_column])
        total_rows += len(frame)

        for queue_value, tags_value in zip(frame[queue_column], frame[tags_column]):
            department = canonicalize_department_name(
                queue_value,
                valid_departments=valid_departments,
            )
            tags = parse_tag_list(tags_value)
            # Rows without a recognised department or without any tags are skipped.
            if department is None or not tags:
                continue

            used_rows += 1
            observed_departments.add(department)

            for tag in tags:
                if tag in valid_tags:
                    tag_department_counts[tag][department] += 1

    tag_statistics = {}
    missing_mappings = []
    unused_tags = []
    majority_mismatch_tags = []

    for tag in valid_tags:
        counts = tag_department_counts.get(tag, Counter())
        ranked = counts.most_common()
        total_examples = int(sum(counts.values()))
        # Top-ranked department plus the runner-up ("fallback") and their shares.
        majority_department = ranked[0][0] if ranked else None
        majority_count = int(ranked[0][1]) if ranked else 0
        fallback_department = ranked[1][0] if len(ranked) > 1 else None
        fallback_count = int(ranked[1][1]) if len(ranked) > 1 else 0
        majority_share = float(majority_count / total_examples) if total_examples else 0.0
        majority_margin_share = (
            float((majority_count - fallback_count) / total_examples)
            if total_examples
            else 0.0
        )

        current_department = existing_mapping.get(tag)
        if current_department is None:
            missing_mappings.append(tag)
        if total_examples == 0:
            unused_tags.append(tag)
        # Flag tags whose configured department disagrees with the data majority.
        if current_department is not None and majority_department is not None and current_department != majority_department:
            majority_mismatch_tags.append(tag)

        tag_statistics[tag] = {
            "current_department": current_department,
            "department_counts": dict(counts),
            "total_examples": total_examples,
            "majority_department": majority_department,
            "majority_share": majority_share,
            "majority_margin_share": majority_margin_share,
            "fallback_department": fallback_department,
            "fallback_share": float(fallback_count / total_examples) if total_examples else 0.0,
            "missing_mapping": current_department is None,
            "unused_tag": total_examples == 0,
        }

    # Mapping entries for tags that no longer exist in the tag set.
    redundant_mappings = [
        tag for tag in existing_mapping
        if tag not in valid_tags
    ]
    # Mapping entries pointing at departments that cannot be canonicalized.
    invalid_departments = {
        tag: department
        for tag, department in existing_mapping.items()
        if canonicalize_department_name(department, valid_departments=valid_departments) is None
    }

    return {
        "dataset_paths": [str(Path(path)) for path in dataset_paths],
        "valid_tags": valid_tags,
        "valid_departments": valid_departments,
        "tag_statistics": tag_statistics,
        "missing_mappings": missing_mappings,
        "redundant_mappings": redundant_mappings,
        "invalid_departments": invalid_departments,
        "unused_tags": unused_tags,
        "majority_mismatch_tags": majority_mismatch_tags,
        "observed_departments": sorted(observed_departments),
        "row_count": int(total_rows),
        "used_row_count": int(used_rows),
    }
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
def rebuild_routing_label_policy(
    analysis,
    *,
    existing_mapping=None,
    valid_departments=None,
    dept_prototypes=None,
    embed_text_fn=None,
    rare_tag_threshold=500,
    strong_majority_threshold=0.50,
    distribution_weight=0.60,
    semantic_weight=0.40,
    default_department="Human_Review",
    semantic_override_margin=0.05,
):
    """Rebuild the tag->department policy from an analysis report.

    Selection per tag: tags with at least ``rare_tag_threshold`` observed
    examples keep their data-majority department; rarer tags fall back to
    semantic similarity between the tag text and department vectors, with a
    preference for the currently mapped department when its semantic score is
    within ``semantic_override_margin`` of the winner. Tags still unresolved
    fall back to ``existing_mapping``; an unresolvable tag raises ValueError.

    NOTE(review): ``distribution_weight`` is accepted but never used below —
    the combined score is purely ``semantic_weight * semantic_score``.
    """
    tag_statistics = analysis.get("tag_statistics", {})
    valid_tags = [str(tag) for tag in analysis.get("valid_tags", tag_statistics.keys())]
    valid_departments = [
        str(department)
        for department in (valid_departments or analysis.get("valid_departments", []))
    ]
    existing_mapping = _extract_tag_mapping(existing_mapping)

    department_vectors = _build_department_semantic_vectors(
        valid_departments,
        dept_prototypes=dept_prototypes,
        embed_text_fn=embed_text_fn,
    )

    rebuilt_mapping = {}
    tag_metadata = {}

    for tag in valid_tags:
        stats = tag_statistics.get(tag, {})
        counts = Counter(stats.get("department_counts", {}))
        total_examples = int(stats.get("total_examples", 0))
        majority_department = stats.get("majority_department")
        majority_share = float(stats.get("majority_share", 0.0))
        semantic_scores = _department_semantic_scores(
            tag,
            department_vectors,
            embed_text_fn=embed_text_fn,
        )
        semantic_department = (
            max(semantic_scores, key=semantic_scores.get)
            if semantic_scores
            else None
        )

        if total_examples >= rare_tag_threshold and majority_department is not None:
            # Well-observed tag: trust the data-majority department.
            primary_department = majority_department
            selection_reason = (
                "majority_distribution"
                if majority_share >= strong_majority_threshold
                else "majority_with_fallback"
            )
            ranked_combined = []
        else:
            # Rare/unseen tag: score departments semantically only.
            combined_scores = {}
            for department in valid_departments:
                semantic_score = float(semantic_scores.get(department, 0.0))
                combined_scores[department] = float(semantic_weight) * semantic_score

            ranked_combined = sorted(
                combined_scores.items(),
                key=lambda item: item[1],
                reverse=True,
            )
            primary_department = semantic_department or (ranked_combined[0][0] if ranked_combined else None)
            current_department = existing_mapping.get(tag)
            current_semantic_score = float(semantic_scores.get(current_department, 0.0))
            winning_semantic_score = float(semantic_scores.get(primary_department, 0.0))
            # Keep the existing mapping when it is close enough to the winner.
            if (
                current_department in valid_departments
                and current_semantic_score + float(semantic_override_margin) >= winning_semantic_score
            ):
                primary_department = current_department
            if total_examples == 0:
                selection_reason = "semantic_fallback"
            else:
                selection_reason = "rare_tag_semantic"

        if primary_department is None:
            primary_department = existing_mapping.get(tag)
            selection_reason = "existing_mapping_fallback"

        if primary_department is None or primary_department not in valid_departments:
            raise ValueError(f"Unable to determine a valid department for tag '{tag}'.")

        # Secondary department: observed runner-up first, else semantic runner-up.
        observed_fallback = stats.get("fallback_department")
        combined_fallback = (
            ranked_combined[1][0]
            if len(ranked_combined) > 1
            else None
        )
        fallback_department = observed_fallback or combined_fallback
        if fallback_department == primary_department:
            fallback_department = None

        rebuilt_mapping[tag] = primary_department
        tag_metadata[tag] = {
            "primary_department": primary_department,
            "fallback_department": fallback_department,
            "current_department": existing_mapping.get(tag),
            "majority_department": majority_department,
            "majority_share": majority_share,
            "total_examples": total_examples,
            "selection_reason": selection_reason,
            "department_counts": dict(counts),
            "semantic_department": semantic_department,
        }

    policy = build_routing_label_policy(
        valid_tags=valid_tags,
        valid_departments=valid_departments,
        base_mapping=rebuilt_mapping,
        fallback_tag_to_department=existing_mapping,
        default_department=default_department,
    )
    policy.update(
        {
            "tag_metadata": tag_metadata,
            "analysis_summary": {
                "row_count": int(analysis.get("row_count", 0)),
                "used_row_count": int(analysis.get("used_row_count", 0)),
                "missing_mappings": list(analysis.get("missing_mappings", [])),
                "redundant_mappings": list(analysis.get("redundant_mappings", [])),
                "invalid_departments": dict(analysis.get("invalid_departments", {})),
                "unused_tags": list(analysis.get("unused_tags", [])),
                "majority_mismatch_tags": list(analysis.get("majority_mismatch_tags", [])),
            },
            "policy_name": "routing_label_policy",
        }
    )
    return policy
|
| 616 |
+
|
| 617 |
+
|
| 618 |
+
def load_routing_label_policy(
    policy_path,
    *,
    fallback_tag_to_department=None,
    valid_tags=None,
    valid_departments=None,
    default_department="Human_Review",
):
    """Load a persisted routing policy and rebuild it against current inputs.

    A missing path yields a policy built purely from the fallback mapping.
    When ``valid_tags`` is not given, the stored mapping's tags are used;
    extra keys stored alongside the mapping are copied onto the rebuilt
    policy (the mapping itself is always the freshly validated one).
    """
    stored = {}
    if policy_path is not None:
        path = Path(policy_path)
        if path.exists():
            stored = joblib.load(path)

    base_mapping = _extract_tag_mapping(stored)

    tags = [str(tag) for tag in valid_tags] if valid_tags is not None else list(base_mapping)
    departments = (
        [str(dept) for dept in valid_departments]
        if valid_departments is not None
        else []
    )

    policy = build_routing_label_policy(
        valid_tags=tags,
        valid_departments=departments,
        base_mapping=base_mapping,
        fallback_tag_to_department=fallback_tag_to_department,
        default_department=default_department,
    )

    if isinstance(stored, Mapping):
        for key, value in stored.items():
            if key != "tag_to_department":
                policy[key] = value
    return policy
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
def normalize_semantic_similarity(raw_similarity: float) -> float:
    """Rescale a cosine similarity from [-1, 1] onto [0, 1], clipping outliers."""
    shifted = (float(raw_similarity) + 1.0) / 2.0
    return float(np.clip(shifted, 0.0, 1.0))
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
def _to_tag_prob_dict(tag_prob_source, tag_names=None) -> dict[str, float]:
    """Normalise tag probabilities into a ``{tag: prob}`` dict of plain floats.

    Mapping inputs are converted directly. Array-like inputs require
    ``tag_names``; pairing stops at the shorter of names vs. probabilities.

    Raises:
        ValueError: for a non-mapping source without ``tag_names``.
    """
    if isinstance(tag_prob_source, Mapping):
        return {str(tag): float(prob) for tag, prob in tag_prob_source.items()}

    if tag_names is None:
        raise ValueError("tag_names are required when tag_prob_source is not a mapping.")

    probs = np.asarray(tag_prob_source, dtype=float).reshape(-1)
    # zip truncates to the shorter sequence, dropping unpaired entries.
    return {str(name): float(prob) for name, prob in zip(tag_names, probs)}
|
| 675 |
+
|
| 676 |
+
|
| 677 |
+
def _department_classifier_confidence(probabilities) -> float:
    """Combine per-tag probabilities via noisy-OR: P(at least one tag fires).

    Inputs are clipped to [0, 1] first; an empty sequence scores 0.0.
    """
    if len(probabilities) == 0:
        return 0.0
    clipped = np.clip(np.asarray(probabilities, dtype=float), 0.0, 1.0)
    miss_probability = np.prod(1.0 - clipped)
    return float(1.0 - miss_probability)
|
| 682 |
+
|
| 683 |
+
|
| 684 |
+
def compute_department_hybrid_scores(
    tag_prob_source,
    embedding,
    dept_prototypes,
    tag_to_department=None,
    *,
    tag_names=None,
    classifier_weight=0.7,
    similarity_weight=0.3,
    top_k=5,
):
    """Blend tag-classifier confidence with prototype similarity per department.

    The ``top_k`` most probable tags are mapped to departments; each candidate
    department gets a noisy-OR classifier confidence from its tags plus a
    normalised cosine similarity between ``embedding`` and its prototype,
    combined as ``classifier_weight * conf + similarity_weight * sim``.

    Returns ``(best_department, best_hybrid_confidence, details, top_tag_votes)``;
    ``(None, 0.0, {}, votes)`` when there are no candidate departments at all.

    Raises:
        ValueError: via assert_predicted_tags_mapped when a top tag is not
            mapped to a department present in ``dept_prototypes``.
    """
    tag_to_department = tag_to_department or DEFAULT_TAG_TO_DEPARTMENT
    tag_prob_dict = _to_tag_prob_dict(tag_prob_source, tag_names=tag_names)
    sorted_tags = sorted(
        tag_prob_dict.items(),
        key=lambda item: item[1],
        reverse=True,
    )
    if top_k is not None:
        sorted_tags = sorted_tags[:top_k]

    # Fail fast if any of the tags we are about to use cannot be routed.
    assert_predicted_tags_mapped(
        [tag for tag, _ in sorted_tags],
        tag_to_department,
        valid_departments=dept_prototypes.keys(),
    )

    # department -> clipped probabilities of its contributing tags
    department_prob_lists: dict[str, list[float]] = {}
    top_tag_votes = []

    for tag, prob in sorted_tags:
        department = tag_to_department.get(tag)
        if department is None:
            continue
        clipped_prob = float(np.clip(prob, 0.0, 1.0))
        department_prob_lists.setdefault(department, []).append(clipped_prob)
        top_tag_votes.append(
            {
                "tag": tag,
                "score": clipped_prob,
                "department": department,
            }
        )

    details = {}
    # Consider every department with a prototype OR with at least one tag vote.
    candidate_departments = set(dept_prototypes) | set(department_prob_lists)
    if not candidate_departments:
        return None, 0.0, {}, top_tag_votes

    emb = np.asarray(embedding, dtype=float).reshape(1, -1)

    for department in candidate_departments:
        classifier_confidence = _department_classifier_confidence(
            department_prob_lists.get(department, [])
        )

        # Departments without a prototype get raw similarity 0.0 (maps to 0.5).
        proto = dept_prototypes.get(department)
        raw_similarity = 0.0
        if proto is not None:
            raw_similarity = float(
                cosine_similarity(
                    emb,
                    np.asarray(proto, dtype=float).reshape(1, -1),
                )[0][0]
            )

        semantic_similarity = normalize_semantic_similarity(raw_similarity)
        hybrid_confidence = (
            float(classifier_weight) * classifier_confidence
            + float(similarity_weight) * semantic_similarity
        )

        details[department] = {
            "department": department,
            "classifier_confidence": float(classifier_confidence),
            "semantic_similarity": float(semantic_similarity),
            "raw_semantic_similarity": float(raw_similarity),
            "hybrid_confidence": float(hybrid_confidence),
        }

    best_department = max(
        details,
        key=lambda dept: details[dept]["hybrid_confidence"],
    )
    best_hybrid_confidence = float(details[best_department]["hybrid_confidence"])
    return best_department, best_hybrid_confidence, details, top_tag_votes
|
utils/review_policy_utils.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import joblib
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Default knobs for the controlled human-review injection policy.
DEFAULT_TARGET_REVIEW_FRACTION = 0.15
DEFAULT_MIN_REVIEW_FRACTION = 0.10
DEFAULT_FALLBACK_THRESHOLD = 0.55


def build_default_review_policy(
    *,
    target_review_fraction: float = DEFAULT_TARGET_REVIEW_FRACTION,
    min_review_fraction: float = DEFAULT_MIN_REVIEW_FRACTION,
    fallback_threshold: float = DEFAULT_FALLBACK_THRESHOLD,
):
    """Return an unfitted review-policy dict populated from the given knobs.

    The target fraction is floored at ``min_review_fraction`` and every
    fraction/threshold is clipped to [0, 1]; the percentile/effective fields
    simply mirror the fallback values until a policy is actually fitted.
    """
    floored_target = max(float(target_review_fraction), float(min_review_fraction))
    target = float(np.clip(floored_target, 0.0, 1.0))
    threshold = float(np.clip(fallback_threshold, 0.0, 1.0))
    return {
        "target_review_fraction": target,
        "min_review_fraction": float(np.clip(min_review_fraction, 0.0, 1.0)),
        "fallback_threshold": threshold,
        "percentile_threshold": threshold,
        "effective_threshold": threshold,
        "percentile_review_fraction": target,
        "effective_review_fraction": target,
        "method": "percentile_plus_fixed_threshold",
        "fit_source": "default",
        "sample_size": 0,
    }
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def select_review_indices_by_percentile(
    hybrid_confidences,
    *,
    target_review_fraction: float = DEFAULT_TARGET_REVIEW_FRACTION,
    min_review_fraction: float = DEFAULT_MIN_REVIEW_FRACTION,
):
    """Flag the lowest-confidence fraction of scores for human review.

    Returns a boolean mask over ``hybrid_confidences`` marking roughly
    ``target_review_fraction`` of the entries (never fewer than
    ``min_review_fraction`` implies), always choosing the smallest scores.
    """
    scores = np.asarray(hybrid_confidences, dtype=float).reshape(-1)
    mask = np.zeros(scores.size, dtype=bool)
    if scores.size == 0:
        return mask

    floored_target = max(float(target_review_fraction), float(min_review_fraction))
    target = float(np.clip(floored_target, 0.0, 1.0))

    wanted = int(np.ceil(scores.size * target))
    wanted = max(wanted, int(np.ceil(scores.size * float(min_review_fraction))))
    wanted = min(wanted, scores.size)
    if wanted == 0:
        return mask

    # Stable sort keeps ties in input order, so selection is deterministic.
    lowest = np.argsort(scores, kind="mergesort")[:wanted]
    mask[lowest] = True
    return mask
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def fit_review_policy(
    hybrid_confidences,
    *,
    target_review_fraction: float = DEFAULT_TARGET_REVIEW_FRACTION,
    min_review_fraction: float = DEFAULT_MIN_REVIEW_FRACTION,
    fallback_threshold: float = DEFAULT_FALLBACK_THRESHOLD,
):
    """Fit the percentile + fixed-threshold review policy on validation scores.

    With no scores the plain default policy is returned. Otherwise the
    percentile threshold is the highest score inside the flagged low-score
    fraction, and the effective statistics describe the union of both review
    triggers (percentile band OR below the fixed fallback threshold).
    """
    scores = np.asarray(hybrid_confidences, dtype=float).reshape(-1)
    if scores.size == 0:
        return build_default_review_policy(
            target_review_fraction=target_review_fraction,
            min_review_fraction=min_review_fraction,
            fallback_threshold=fallback_threshold,
        )

    flagged = select_review_indices_by_percentile(
        scores,
        target_review_fraction=target_review_fraction,
        min_review_fraction=min_review_fraction,
    )
    if flagged.any():
        percentile_threshold = float(np.max(scores[flagged]))
    else:
        percentile_threshold = float(fallback_threshold)
    fallback_threshold = float(np.clip(fallback_threshold, 0.0, 1.0))
    effective_threshold = float(max(percentile_threshold, fallback_threshold))
    # A ticket lands in review when either trigger fires.
    effective_mask = (scores <= percentile_threshold) | (scores < fallback_threshold)

    policy = build_default_review_policy(
        target_review_fraction=target_review_fraction,
        min_review_fraction=min_review_fraction,
        fallback_threshold=fallback_threshold,
    )
    policy.update(
        {
            "percentile_threshold": percentile_threshold,
            "effective_threshold": effective_threshold,
            "percentile_review_fraction": float(flagged.mean()),
            "effective_review_fraction": float(effective_mask.mean()),
            "fit_source": "validation",
            "sample_size": int(scores.size),
            "hybrid_confidence_mean": float(np.mean(scores)),
            "hybrid_confidence_std": float(np.std(scores)),
            "hybrid_confidence_min": float(np.min(scores)),
            "hybrid_confidence_max": float(np.max(scores)),
        }
    )
    return policy
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def load_review_policy(path, default=None):
    """Read a persisted review policy, overlaying it on the defaults.

    Non-dict payloads and missing files fall back to ``default`` (when
    truthy) or a fresh default policy.
    """
    policy_file = Path(path)
    if policy_file.exists():
        stored = joblib.load(policy_file)
        if isinstance(stored, dict):
            policy = build_default_review_policy()
            policy.update(stored)
            return policy
    return default or build_default_review_policy()
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def apply_controlled_review(mode, hybrid_confidence, review_policy=None):
    """Apply the controlled-review injection on top of a base routing mode.

    Two rules can force a ticket into HUMAN_REVIEW: a percentile rule
    (confidence <= calibrated percentile threshold) and a fixed-threshold
    rule (confidence < fallback threshold).  Tickets already routed to
    HUMAN_REVIEW are never re-forced.

    Returns:
        (final_mode, requires_review, decision) where ``decision`` is a
        dict describing which rules fired and the thresholds in effect.
    """
    active_policy = review_policy if review_policy else build_default_review_policy()
    confidence = float(np.clip(hybrid_confidence, 0.0, 1.0))

    pct_threshold = float(active_policy.get("percentile_threshold", DEFAULT_FALLBACK_THRESHOLD))
    fb_threshold = float(active_policy.get("fallback_threshold", DEFAULT_FALLBACK_THRESHOLD))
    eff_threshold = float(
        active_policy.get("effective_threshold", max(pct_threshold, fb_threshold))
    )

    # Rule evaluation: note the percentile rule is inclusive (<=) while the
    # fixed-threshold rule is strict (<), matching how the policy was fit.
    pct_hit = confidence <= pct_threshold
    fb_hit = confidence < fb_threshold
    force_review = (pct_hit or fb_hit) and mode != "HUMAN_REVIEW"

    rules = [
        name
        for hit, name in ((pct_hit, "percentile"), (fb_hit, "fixed_threshold"))
        if hit
    ]

    resolved_mode = "HUMAN_REVIEW" if force_review else mode
    needs_review = resolved_mode != "AUTO_ROUTE"

    if force_review:
        reason = (
            f"Controlled review injection forced HUMAN_REVIEW: hybrid_confidence={confidence:.4f}, "
            f"percentile_threshold={pct_threshold:.4f}, fallback_threshold={fb_threshold:.4f}."
        )
    elif mode == "HUMAN_REVIEW":
        reason = "Two-stage gate already routed the ticket to HUMAN_REVIEW."
    elif mode == "AUTO_ROUTE_FLAGGED":
        reason = "Ticket remains AUTO_ROUTE_FLAGGED after the controlled review check."
    else:
        reason = "Ticket passed the controlled review check and remains AUTO_ROUTE."

    decision = {
        "base_mode": mode,
        "final_mode": resolved_mode,
        "requires_review": needs_review,
        "forced_human_review": force_review,
        "triggered_rules": rules,
        "hybrid_confidence": confidence,
        "percentile_threshold": pct_threshold,
        "fallback_threshold": fb_threshold,
        "effective_threshold": eff_threshold,
        "target_review_fraction": float(active_policy.get("target_review_fraction", DEFAULT_TARGET_REVIEW_FRACTION)),
        "reason": reason,
    }
    return resolved_mode, needs_review, decision
|
utils/runtime_utils.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
import joblib
|
| 8 |
+
import yaml
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from .hybrid_routing_utils import DEFAULT_TAG_TO_DEPARTMENT
|
| 12 |
+
except ImportError: # pragma: no cover
|
| 13 |
+
from hybrid_routing_utils import DEFAULT_TAG_TO_DEPARTMENT
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# Duplicate-similarity cutoff used when Models/duplicate_thresholds.pkl is
# absent or unreadable.  NOTE(review): presumably calibrated offline — confirm
# against the persisted duplicate_thresholds.pkl artifact.
DEFAULT_DUPLICATE_THRESHOLD = 0.7623

# Built-in routing configuration; load_routing_config() overlays any values
# found in a routing_config.yaml file on top of these defaults.
DEFAULT_ROUTING_CONFIG = {
    "global_threshold": 0.35,
    "confidence_threshold": 0.45,
    "default_department": "Human_Review",
    # Copy so later mutation of the config never alters the imported mapping.
    "departments": dict(DEFAULT_TAG_TO_DEPARTMENT),
    "priority_escalation": {
        "critical": "Escalation",
        "high": None,  # None means "no escalation" for this priority level
    },
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _resolve_base_dir(base_dir: str | Path | None = None) -> Path:
|
| 31 |
+
if base_dir is None:
|
| 32 |
+
return Path(__file__).resolve().parent
|
| 33 |
+
return Path(base_dir).resolve()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def resolve_model_dir(base_dir: str | Path | None = None) -> Path:
    """Locate the ``Models`` directory for the deployment bundle.

    Looks next to the resolved base directory first, then one level up.
    Returns the first candidate that exists; when neither does, the first
    candidate is returned unchanged so callers get a deterministic path.
    """
    root = _resolve_base_dir(base_dir)
    options = [root / "Models", root.parent / "Models"]
    return next((path for path in options if path.exists()), options[0])
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def resolve_data_root(base_dir: str | Path | None = None) -> Path:
    """Locate the ``Datasets`` directory for the deployment bundle.

    Mirrors :func:`resolve_model_dir`: checks beside the base directory,
    then one level above it, and falls back to the first candidate when
    neither location exists.
    """
    root = _resolve_base_dir(base_dir)
    options = [root / "Datasets", root.parent / "Datasets"]
    return next((path for path in options if path.exists()), options[0])
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def resolve_dataset_file(
    base_dir: str | Path | None,
    filename: str,
    *,
    prefer_processed: bool = True,
) -> Path:
    """Locate a dataset file inside the deployment bundle.

    Searches the data root and its ``Processed`` subdirectory, with the
    search order controlled by ``prefer_processed``.

    Args:
        base_dir: Bundle root hint forwarded to :func:`resolve_data_root`.
        filename: Dataset file name to look up.
        prefer_processed: When True, check ``Processed/<filename>`` before
            the flat ``<filename>`` location; otherwise the reverse.

    Returns:
        The first existing candidate path.

    Raises:
        FileNotFoundError: If the file exists in none of the candidates.
    """
    data_root = resolve_data_root(base_dir)
    processed = data_root / "Processed" / filename
    flat = data_root / filename
    ordered = [processed, flat] if prefer_processed else [flat, processed]

    for candidate in ordered:
        if candidate.exists():
            return candidate

    # Fix: interpolate the requested filename so the error says which dataset
    # is missing (the previous message hard-coded the literal "(unknown)").
    raise FileNotFoundError(
        f"Dataset '{filename}' not found in deployment bundle. Checked: {ordered}"
    )
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def load_model_config(base_dir: str | Path | None = None) -> dict[str, Any]:
    """Load ``Models/model_config.pkl`` via joblib.

    Returns an empty dict when the artifact is missing or when its payload
    is not a mapping, so callers never have to null-check.
    """
    config_path = resolve_model_dir(base_dir) / "model_config.pkl"
    if config_path.exists():
        payload = joblib.load(config_path)
        if isinstance(payload, dict):
            return payload
    return {}
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def resolve_model_reference(
    model_ref: str | Path | None,
    *,
    base_dir: str | Path | None = None,
    model_dir: str | Path | None = None,
    default: str | None = None,
) -> str:
    """Resolve a possibly-stale model reference to an existing local path.

    Model references persisted in artifacts may carry absolute paths from the
    training machine.  This routine rebases such paths onto the current
    deployment bundle by trying a prioritized list of candidate locations and
    returning the first one that exists on disk.

    Args:
        model_ref: Path-like reference (absolute or relative) or None/"".
        base_dir: Bundle root hint; defaults to this module's directory.
        model_dir: Explicit Models directory; overrides resolve_model_dir().
        default: Value returned when the reference is empty or unresolvable.

    Returns:
        A string path to the first existing candidate, else ``default`` if
        provided, else the raw reference unchanged.

    Raises:
        FileNotFoundError: If ``model_ref`` is empty and no default is given.
    """
    if model_ref in (None, ""):
        if default is None:
            raise FileNotFoundError("No model reference was provided.")
        return str(default)

    raw_value = str(model_ref)
    raw_path = Path(raw_value)
    base_path = _resolve_base_dir(base_dir)
    model_path_root = Path(model_dir).resolve() if model_dir is not None else resolve_model_dir(base_path)

    # Candidate order encodes priority: the literal path first, then
    # progressively more aggressive rebasing onto the local bundle.
    candidates: list[Path] = []
    if raw_path.is_absolute():
        candidates.append(raw_path)
        if "Models" in raw_path.parts:
            # Rebase everything after the foreign "Models" segment onto the
            # local Models directory (preserves any subdirectory structure).
            model_idx = raw_path.parts.index("Models")
            suffix_parts = raw_path.parts[model_idx + 1 :]
            if suffix_parts:
                candidates.append(model_path_root.joinpath(*suffix_parts))
        # Last resorts: just the basename under Models, then under base_dir.
        candidates.append(model_path_root / raw_path.name)
        candidates.append(base_path / raw_path.name)
    else:
        candidates.extend(
            [
                raw_path,
                base_path / raw_path,
                model_path_root / raw_path,
                model_path_root / raw_path.name,
                base_path / raw_path.name,
            ]
        )

    # Deduplicate while preserving priority order; existing paths are
    # normalized with resolve() before comparison.
    seen = set()
    for candidate in candidates:
        candidate = candidate.resolve() if candidate.exists() else candidate
        normalized = str(candidate)
        if normalized in seen:
            continue
        seen.add(normalized)
        if candidate.exists():
            return str(candidate)

    if default is not None:
        return str(default)
    return raw_value
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def _merge_routing_config(loaded: dict[str, Any] | None) -> dict[str, Any]:
    """Overlay a loaded routing config onto the built-in defaults.

    Scalar keys are replaced when present in ``loaded``; the nested
    ``departments`` and ``priority_escalation`` mappings are merged
    key-by-key so partial overrides keep the remaining defaults.  A
    non-dict ``loaded`` yields a pure copy of the defaults.
    """
    scalar_keys = ("global_threshold", "confidence_threshold", "default_department")
    merged: dict[str, Any] = {key: DEFAULT_ROUTING_CONFIG[key] for key in scalar_keys}
    # Copy the nested mappings so updates never mutate the module defaults.
    merged["departments"] = dict(DEFAULT_ROUTING_CONFIG["departments"])
    merged["priority_escalation"] = dict(DEFAULT_ROUTING_CONFIG["priority_escalation"])

    if isinstance(loaded, dict):
        for key in scalar_keys:
            if key in loaded:
                merged[key] = loaded[key]
        merged["departments"].update(loaded.get("departments") or {})
        merged["priority_escalation"].update(loaded.get("priority_escalation") or {})
    return merged
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def load_routing_config(
    base_dir: str | Path | None = None,
) -> tuple[dict[str, Any], Path | None]:
    """Load routing_config.yaml and merge it over the built-in defaults.

    Checks ``config/`` under the base directory, then the Models directory,
    then ``config/`` one level up.

    Returns:
        (merged_config, path_used) — ``path_used`` is None when no YAML file
        was found, in which case pure defaults are returned.
    """
    root = _resolve_base_dir(base_dir)
    model_root = resolve_model_dir(root)
    search_order = (
        root / "config" / "routing_config.yaml",
        model_root / "routing_config.yaml",
        root.parent / "config" / "routing_config.yaml",
    )

    for config_path in search_order:
        if config_path.exists():
            with config_path.open("r", encoding="utf-8") as handle:
                loaded = yaml.safe_load(handle)
            return _merge_routing_config(loaded), config_path

    return _merge_routing_config(None), None
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def load_duplicate_threshold(base_dir: str | Path | None = None) -> float:
    """Read the duplicate-similarity threshold from the model artifacts.

    Loads ``Models/duplicate_thresholds.pkl`` when present and returns its
    ``duplicate_threshold`` entry as a float.  Any missing file, non-dict
    payload, or non-numeric value falls back to the module default.
    """
    artifact = resolve_model_dir(base_dir) / "duplicate_thresholds.pkl"
    if artifact.exists():
        payload = joblib.load(artifact)
        if isinstance(payload, dict):
            try:
                return float(payload.get("duplicate_threshold", DEFAULT_DUPLICATE_THRESHOLD))
            except (TypeError, ValueError):
                # Corrupt/non-numeric value in the artifact: use the default.
                pass
    return float(DEFAULT_DUPLICATE_THRESHOLD)
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def load_metric_artifact(
    base_dir: str | Path | None,
    filename: str,
) -> dict[str, Any]:
    """Load a JSON metric artifact from the Models directory.

    Args:
        base_dir: Bundle root hint forwarded to :func:`resolve_model_dir`.
        filename: Name of the JSON artifact inside the Models directory.

    Raises:
        FileNotFoundError: If the artifact does not exist.
    """
    artifact_path = resolve_model_dir(base_dir) / filename
    if artifact_path.exists():
        with artifact_path.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    raise FileNotFoundError(f"Metric artifact not found: {artifact_path}")
|