Spaces:

devrup404
/

SignalMod

Running

SignalMod / src / models / hybrid_ensemble.py

Mirae Kang

feat: implement new models and improve UI, #23

46cc63a 8 days ago

9.24 kB

	"""
	Hybrid ensemble: regularized DistilBERT probabilities + TF-IDF logistic regression.
	"""

	from __future__ import annotations

	import json
	from pathlib import Path

	import numpy as np
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import f1_score, roc_auc_score

	from src.evaluation.threshold_tuning import predict_with_threshold, search_best_threshold
	from sklearn.pipeline import Pipeline
	from sklearn.feature_extraction.text import TfidfVectorizer

	from src.utils.logger import get_logger

	logger = get_logger(__name__)


	class StableLRModel:
	    """Logistic regression on TF-IDF features with strong default regularization.

	    The original note ties the defaults to ``stable_training.yaml``; that file
	    is not visible here, so treat the link as an assumption to confirm.
	    """

	    def __init__(self, lr_cfg: dict, tfidf_cfg: dict, *, C: float | None = None):
	        """Assemble the (unfitted) TF-IDF -> logistic-regression pipeline.

	        Args:
	            lr_cfg: Classifier settings; the keys ``C``, ``max_iter``,
	                ``class_weight`` and ``solver`` are read, each with a default.
	            tfidf_cfg: Vectorizer settings; the keys ``ngram_range``,
	                ``max_features``, ``sublinear_tf`` and ``min_df`` are read,
	                each with a default.
	            C: Optional inverse regularization strength. When given it takes
	                precedence over ``lr_cfg["C"]``.
	        """
	        vectorizer = TfidfVectorizer(
	            max_features=int(tfidf_cfg.get("max_features", 5000)),
	            # Config files typically store the range as a list; sklearn wants a tuple.
	            ngram_range=tuple(tfidf_cfg.get("ngram_range", [1, 2])),
	            sublinear_tf=bool(tfidf_cfg.get("sublinear_tf", True)),
	            min_df=int(tfidf_cfg.get("min_df", 3)),
	            analyzer="word",
	            strip_accents="unicode",
	        )

	        strength = lr_cfg.get("C", 0.05) if C is None else C
	        classifier = LogisticRegression(
	            C=float(strength),
	            max_iter=int(lr_cfg.get("max_iter", 2000)),
	            class_weight=lr_cfg.get("class_weight", "balanced"),
	            solver=lr_cfg.get("solver", "lbfgs"),
	            random_state=42,
	        )

	        self.pipeline = Pipeline([("tfidf", vectorizer), ("clf", classifier)])
	        self.is_fitted = False

	    @property
	    def _classifier(self):
	        """The logistic-regression step of the pipeline."""
	        return self.pipeline.named_steps["clf"]

	    def fit(self, X_train, y_train):
	        """Fit the whole pipeline on the training data and return ``self``."""
	        logger.info(f"Training stable LR — C={self.pipeline.named_steps['clf'].C}")
	        self.pipeline.fit(X_train, y_train)
	        self.is_fitted = True
	        return self

	    @property
	    def C(self) -> float:
	        """Current inverse regularization strength of the classifier step."""
	        return float(self._classifier.C)

	    def set_C(self, c: float) -> None:
	        """Overwrite the classifier's ``C``; takes effect on the next ``fit``."""
	        self._classifier.C = float(c)

	    def train_test_gap(self, X_train, y_train, X_test, y_test) -> tuple[float, float, float]:
	        """Measure how far train and test weighted F1 are apart.

	        Returns:
	            ``(f1_train, f1_test, gap)`` where ``gap`` is the absolute
	            difference between the two weighted-F1 scores.
	        """
	        train_pred = self.predict(X_train)
	        test_pred = self.predict(X_test)
	        train_true = np.asarray(y_train).astype(int)
	        test_true = np.asarray(y_test).astype(int)

	        scores = [
	            float(f1_score(truth, pred, average="weighted", zero_division=0))
	            for truth, pred in ((train_true, train_pred), (test_true, test_pred))
	        ]
	        train_f1, test_f1 = scores
	        return train_f1, test_f1, abs(train_f1 - test_f1)

	    def predict(self, X):
	        """Return the pipeline's hard class predictions for ``X``."""
	        return self.pipeline.predict(X)

	    def predict_proba(self, X) -> np.ndarray:
	        """Return the pipeline's class probabilities for ``X``."""
	        return self.pipeline.predict_proba(X)

	    def save(self, path: str | Path) -> None:
	        """Serialize the pipeline with joblib, creating parent directories as needed."""
	        import joblib

	        destination = Path(path)
	        destination.parent.mkdir(parents=True, exist_ok=True)
	        joblib.dump(self.pipeline, destination)
	        logger.info(f"Stable LR saved: {destination}")

	    @classmethod
	    def load(cls, path: str | Path) -> "StableLRModel":
	        """Rebuild a model from a pipeline previously written by ``save``.

	        ``__init__`` is bypassed on purpose: the stored pipeline already holds
	        its hyper-parameters, so no config dicts are needed.
	        """
	        import joblib

	        # NOTE(security): joblib.load unpickles the file, which can execute
	        # arbitrary code. Only load artifacts from trusted locations.
	        restored = cls.__new__(cls)
	        restored.pipeline = joblib.load(path)
	        restored.is_fitted = True
	        return restored


	def fit_lr_with_gap_control(
	    X_train,
	    y_train,
	    X_test,
	    y_test,
	    lr_cfg: dict,
	    tfidf_cfg: dict,
	    *,
	    max_gap: float = 0.05,
	    X_train_gap=None,
	    y_train_gap=None,
	) -> tuple[StableLRModel, dict]:
	    """
	    Fit LR on augmented train; tune regularization until |train F1 - test F1| < max_gap.

	    Candidates are tried in grid order. The search stops at the first candidate
	    whose gap is below ``max_gap``; if none qualifies, the candidate with the
	    smallest gap is returned and a warning is logged.

	    Args:
	        X_train, y_train: Data every candidate model is fitted on.
	        X_test, y_test: Data used for the "test" side of the gap.
	        lr_cfg: Classifier config. ``lr_cfg["gap_search"]`` may hold
	            ``enabled``, ``param_grid`` (list of dicts with ``C`` plus TF-IDF
	            overrides) and ``C_candidates``.
	        tfidf_cfg: Base TF-IDF config; per-candidate keys override it.
	        max_gap: Largest acceptable absolute weighted-F1 difference.
	        X_train_gap, y_train_gap: Optional data for the "train" side of the
	            gap; defaults to ``X_train`` / ``y_train``.

	    Returns:
	        ``(model, meta)`` for the selected candidate. ``meta`` holds the chosen
	        ``C``, the effective ``max_features`` / ``min_df``, both F1 scores, the
	        gap (absolute and in percentage points) and a ``gap_ok`` flag.

	    Raises:
	        ValueError: If the search grid is empty.

	    NOTE(review): candidates are selected using ``X_test`` / ``y_test``, so the
	    reported ``f1_test`` may be optimistic if that same split is the final
	    hold-out — confirm with the caller.
	    """
	    gap_cfg = lr_cfg.get("gap_search", {})
	    X_gap = X_train_gap if X_train_gap is not None else X_train
	    y_gap = y_train_gap if y_train_gap is not None else y_train
	    default_c = lr_cfg.get("C", 0.05)

	    if not gap_cfg.get("enabled", True):
	        grid = [{"C": float(default_c), **tfidf_cfg}]
	    else:
	        grid = gap_cfg.get("param_grid") or [
	            {"C": c, **tfidf_cfg} for c in gap_cfg.get("C_candidates", [default_c])
	        ]

	    if not grid:
	        # An empty C_candidates list would otherwise end in an opaque KeyError below.
	        raise ValueError("LR gap search grid is empty; check lr_cfg['gap_search']")

	    best: StableLRModel | None = None
	    best_meta: dict = {}
	    best_gap = float("inf")

	    for params in grid:
	        # Everything except "C" in a grid entry is a TF-IDF override for this candidate.
	        overrides = {k: v for k, v in params.items() if k != "C"}
	        merged_tfidf = {**tfidf_cfg, **overrides}
	        c = float(params.get("C", default_c))

	        model = StableLRModel(lr_cfg, merged_tfidf, C=c)
	        model.fit(X_train, y_train)
	        f1_train, f1_test, gap = model.train_test_gap(X_gap, y_gap, X_test, y_test)

	        # Report the values the vectorizer actually uses, so defaults applied
	        # inside StableLRModel are reflected instead of a second, diverging default.
	        tfidf_step = model.pipeline.named_steps["tfidf"]
	        max_features = int(tfidf_step.max_features)
	        min_df = int(tfidf_step.min_df)

	        logger.info(
	            f"LR gap search — C={c} max_features={max_features} "
	            f"min_df={min_df} train_f1={f1_train:.4f} "
	            f"test_f1={f1_test:.4f} gap={gap:.4f}"
	        )
	        meta = {
	            "C": c,
	            "max_features": max_features,
	            "min_df": min_df,
	            "f1_train": round(f1_train, 4),
	            "f1_test": round(f1_test, 4),
	            "train_test_gap": round(gap, 4),
	            "train_test_gap_pp": round(gap * 100, 2),
	            "gap_ok": gap < max_gap,
	        }
	        if gap < best_gap:
	            best, best_meta = model, meta
	            best_gap = gap
	        if gap < max_gap:
	            # First acceptable candidate wins; later grid entries are not tried.
	            logger.info(f"LR gap OK at C={c}")
	            break

	    if not best_meta.get("gap_ok"):
	        logger.warning(
	            f"LR gap still {best_meta['train_test_gap']:.4f} after grid search; "
	            f"using best gap C={best_meta['C']}"
	        )

	    return best, best_meta  # type: ignore[return-value]


	def soft_vote_probs(
	    prob_a: np.ndarray,
	    prob_b: np.ndarray,
	    weight_a: float = 0.5,
	    weight_b: float = 0.5,
	) -> np.ndarray:
	    """Blend two probability arrays as a weighted average.

	    The weights are normalized by their sum, so they do not need to add up to 1.

	    Args:
	        prob_a: Probabilities from the first model (array or array-like).
	        prob_b: Probabilities from the second model; combined element-wise
	            with ``prob_a`` under NumPy broadcasting rules.
	        weight_a: Weight applied to ``prob_a``.
	        weight_b: Weight applied to ``prob_b``.

	    Returns:
	        ``(weight_a * prob_a + weight_b * prob_b) / (weight_a + weight_b)``.

	    Raises:
	        ValueError: If the weights sum to zero, which would make the average
	            undefined (and would otherwise yield NaN/inf silently).
	    """
	    total = weight_a + weight_b
	    if total == 0:
	        raise ValueError("soft_vote_probs: weight_a + weight_b must be non-zero")

	    # asarray lets plain lists through without copying or re-typing real arrays.
	    a = np.asarray(prob_a)
	    b = np.asarray(prob_b)
	    return (weight_a * a + weight_b * b) / total


	def evaluate_ensemble(
	    bert_probs: np.ndarray,
	    lr_probs: np.ndarray,
	    y_true: np.ndarray,
	    *,
	    bert_weight: float = 0.5,
	    lr_weight: float = 0.5,
	    model_name: str = "Hybrid-ensemble",
	    threshold: float = 0.5,
	) -> dict:
	    """Soft-vote the two probability streams and score the result.

	    Args:
	        bert_probs: Probabilities from the DistilBERT branch.
	        lr_probs: Probabilities from the TF-IDF/LR branch.
	        y_true: Ground-truth labels; cast to ``int`` before scoring.
	        bert_weight: Soft-vote weight of ``bert_probs``.
	        lr_weight: Soft-vote weight of ``lr_probs``.
	        model_name: Label stored under ``"model"`` in the result.
	        threshold: Cut-off handed to ``predict_with_threshold``.

	    Returns:
	        Dict with the model name, rounded threshold, weighted F1, positive-class
	        F1, ROC AUC, false-positive / false-negative counts, and the raw
	        blended probabilities and thresholded predictions.
	    """
	    blended = soft_vote_probs(bert_probs, lr_probs, bert_weight, lr_weight)
	    decisions = predict_with_threshold(blended, threshold)
	    labels = np.asarray(y_true).astype(int)

	    weighted_f1 = float(f1_score(labels, decisions, average="weighted", zero_division=0))
	    # Label 1 is the "toxic" class, per the result key below.
	    toxic_f1 = float(f1_score(labels, decisions, pos_label=1, zero_division=0))
	    auc = float(roc_auc_score(labels, blended))

	    false_pos_mask = (labels == 0) & (decisions == 1)
	    false_neg_mask = (labels == 1) & (decisions == 0)

	    return {
	        "model": model_name,
	        "threshold": round(threshold, 4),
	        "f1_weighted": round(weighted_f1, 4),
	        "f1_toxic": round(toxic_f1, 4),
	        "roc_auc": round(auc, 4),
	        "fp": int(false_pos_mask.sum()),
	        "fn": int(false_neg_mask.sum()),
	        "ensemble_probs": blended,
	        "ensemble_preds": decisions,
	    }


	def tune_ensemble_threshold(
	    bert_probs: np.ndarray,
	    lr_probs: np.ndarray,
	    y_val: np.ndarray,
	    *,
	    bert_weight: float = 0.5,
	    lr_weight: float = 0.5,
	    metric: str = "f1_toxic",
	    min_threshold: float = 0.05,
	    max_threshold: float = 0.95,
	    step: float = 0.01,
	) -> tuple[float, float]:
	    """Pick a decision threshold for the soft-voted validation probabilities.

	    The two branches are blended with ``soft_vote_probs`` and the search itself
	    is delegated to ``search_best_threshold``.

	    Returns:
	        Whatever ``search_best_threshold`` returns — presumably
	        ``(best_threshold, best_score)``; confirm against that helper.
	    """
	    blended = soft_vote_probs(bert_probs, lr_probs, bert_weight, lr_weight)
	    search_options = {
	        "metric": metric,
	        "min_threshold": min_threshold,
	        "max_threshold": max_threshold,
	        "step": step,
	    }
	    return search_best_threshold(y_val, blended, **search_options)


	def compute_performance_weights(
	    bert_probs_val: np.ndarray,
	    lr_probs_val: np.ndarray,
	    y_val: np.ndarray,
	    *,
	    bert_threshold: float = 0.33,
	    lr_threshold: float = 0.5,
	    metric: str = "f1_weighted",
	    min_lr_weight: float = 0.15,
	    max_lr_weight: float = 0.45,
	) -> tuple[float, float, dict]:
	    """
	    Set soft-vote weights proportional to validation F1 (per branch threshold).

	    Each branch is thresholded with its own cut-off and scored on ``y_val``.
	    The LR weight is the LR share of the two scores, clamped to
	    ``[min_lr_weight, max_lr_weight]``; the BERT weight is the remainder.

	    Args:
	        bert_probs_val: Validation probabilities of the BERT branch.
	        lr_probs_val: Validation probabilities of the LR branch.
	        y_val: Validation labels; cast to ``int`` before scoring.
	        bert_threshold: Cut-off for the BERT branch.
	        lr_threshold: Cut-off for the LR branch.
	        metric: ``"f1_toxic"`` scores the positive class only; any other value
	            uses weighted F1.
	        min_lr_weight: Lower bound for the LR weight.
	        max_lr_weight: Upper bound for the LR weight.

	    Returns:
	        ``(bert_weight, lr_weight, info)`` where ``info`` holds both rounded
	        validation scores, both rounded weights and the metric name.
	    """
	    # LR share used when both branches score 0 and no proportion can be formed.
	    fallback_lr_weight = 0.3

	    y = np.asarray(y_val).astype(int)
	    bert_preds = predict_with_threshold(bert_probs_val, bert_threshold)
	    lr_preds = predict_with_threshold(lr_probs_val, lr_threshold)

	    f1_kwargs = {"pos_label": 1} if metric == "f1_toxic" else {"average": "weighted"}
	    bert_score = float(f1_score(y, bert_preds, zero_division=0, **f1_kwargs))
	    lr_score = float(f1_score(y, lr_preds, zero_division=0, **f1_kwargs))

	    total = bert_score + lr_score
	    raw_lw = lr_score / total if total > 0 else fallback_lr_weight
	    # Clamp in every case so the fallback also honours caller-supplied bounds.
	    lw = float(np.clip(raw_lw, min_lr_weight, max_lr_weight))
	    bw = 1.0 - lw

	    return bw, lw, {
	        "bert_val_score": round(bert_score, 4),
	        "lr_val_score": round(lr_score, 4),
	        "bert_weight": round(bw, 4),
	        "lr_weight": round(lw, 4),
	        "weight_metric": metric,
	    }


	def save_ensemble_meta(path: Path, meta: dict) -> None:
	path.parent.mkdir(parents=True, exist_ok=True)
	with open(path, "w") as f:
	json.dump(meta, f, indent=2)