SignalMod / src /models /baseline.py
JonnyBP
backup stable api and model service before pipeline testing
6cda091
"""
src/models/baseline.py

Classical ML models for text classification.
Direct port of notebooks 04 and 05.

All models share the same interface:
    model.fit(X_train, y_train)
    model.predict(X)
    model.predict_proba(X)
    model.save(path)
    Model.load(path)

Usage from the pipeline (every model embeds its own TF-IDF step, so pass
raw text rather than pre-vectorized features):
    model = build_model("lr", config_path="configs/models.yaml")
    model.fit(X_train_text, y_train)
    preds = model.predict(X_test_text)
"""
import yaml
import joblib
import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_validate
from src.utils.logger import get_logger
logger = get_logger(__name__)
# ── Clase base ────────────────────────────────────────────────────────────────
class BaseSklearnModel:
"""
Interfaz comΓΊn para todos los modelos sklearn del proyecto.
Hereda LRModel y EnsembleModel.
"""
def __init__(self):
self.pipeline = None # sklearn Pipeline (TF-IDF + clf)
self.is_fitted = False
def fit(self, X_train, y_train) -> "BaseSklearnModel":
"""Entrena el pipeline completo."""
logger.info(f"Entrenando {self.__class__.__name__}...")
self.pipeline.fit(X_train, y_train)
self.is_fitted = True
logger.info(" Entrenamiento completado")
return self
def predict(self, X) -> np.ndarray:
self._check_fitted()
return self.pipeline.predict(X)
def predict_proba(self, X) -> np.ndarray:
self._check_fitted()
return self.pipeline.predict_proba(X)
def cross_validate(self, X_train, y_train, cv_folds: int = 5, rand: int = 42) -> dict:
"""
EvaluaciΓ³n con StratifiedKFold.
Devuelve medias y desviaciones estΓ‘ndar de las mΓ©tricas.
"""
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=rand)
results = cross_validate(
self.pipeline, X_train, y_train,
cv=cv,
scoring={"f1": "f1_weighted", "roc_auc": "roc_auc"},
return_train_score=True,
n_jobs=-1,
)
summary = {
"cv_f1_mean" : results["test_f1"].mean(),
"cv_f1_std" : results["test_f1"].std(),
"cv_roc_mean" : results["test_roc_auc"].mean(),
"train_f1_mean" : results["train_f1"].mean(),
"gap_pp" : (results["train_f1"].mean() - results["test_f1"].mean()) * 100,
}
logger.info(
f" CV F1: {summary['cv_f1_mean']:.4f} Β± {summary['cv_f1_std']:.4f} | "
f"Gap: {summary['gap_pp']:.1f}pp"
)
return summary
def save(self, path: str | Path) -> None:
path = Path(path)
path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(self.pipeline, path)
logger.info(f"Modelo guardado: {path}")
@classmethod
def load(cls, path: str | Path) -> "BaseSklearnModel":
path = Path(path)
if not path.exists():
raise FileNotFoundError(f"Modelo no encontrado: {path}")
instance = cls.__new__(cls)
instance.pipeline = joblib.load(path)
instance.is_fitted = True
logger.info(f"Modelo cargado: {path}")
return instance
def _check_fitted(self):
if not self.is_fitted:
raise RuntimeError("El modelo no estΓ‘ entrenado. Llama a .fit() primero.")
# ── Logistic Regression ────────────────────────────────────────────────────────
class LRModel(BaseSklearnModel):
    """
    Logistic Regression + TF-IDF.

    Best model of the project (notebook 06):
    test F1 = 0.7579 | CV-test gap = 4.76pp

    Hyperparameters tuned with Optuna are read from ``best_params_path``
    when that file exists; anything it does not provide falls back to the
    YAML model and feature configs.
    """

    def __init__(
        self,
        config_path: str = "configs/models.yaml",
        feat_config_path: str = "configs/features.yaml",
        best_params_path: str = "configs/best_params.yaml",
    ):
        """
        Build the TF-IDF + LogisticRegression pipeline.

        Args:
            config_path: YAML file containing ``models.logistic_regression``.
            feat_config_path: YAML file containing ``vectorization.tfidf``.
            best_params_path: optional YAML file whose ``hyperparameters``
                section overrides the defaults.

        Raises:
            FileNotFoundError: if either required config file is missing.
            KeyError: if a required config key is missing.
        """
        super().__init__()

        # Optional overrides. Only the file access sits inside the try so
        # that unrelated errors are not mistaken for a missing file.
        try:
            with open(best_params_path, encoding="utf-8") as f:
                best = yaml.safe_load(f)
        except FileNotFoundError:
            bp = {}
            logger.warning("best_params.yaml no encontrado — usando config por defecto")
        else:
            # safe_load returns None for an empty file, and the section itself
            # may be null; both are treated as "no overrides".
            bp = (best or {}).get("hyperparameters") or {}
            logger.info("Parámetros cargados desde best_params.yaml")

        # Base configuration (required).
        with open(config_path, encoding="utf-8") as f:
            mod_cfg = yaml.safe_load(f)["models"]["logistic_regression"]
        with open(feat_config_path, encoding="utf-8") as f:
            vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]

        # Priority: best_params > YAML config.
        # NOTE(review): ngram_range is never read from the features YAML; when
        # best_params does not provide it the default stays (1, 2).
        ngram = self._parse_ngram_range(bp.get("ngram_range"))
        c_value = bp.get("C", mod_cfg["C"])

        self.pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(
                max_features=bp.get("max_features", vec_cfg["max_features"]),
                ngram_range=ngram,
                sublinear_tf=bp.get("sublinear_tf", vec_cfg["sublinear_tf"]),
                min_df=bp.get("min_df", vec_cfg["min_df"]),
                analyzer="word",
                strip_accents="unicode",
            )),
            ("clf", LogisticRegression(
                C=c_value,
                max_iter=mod_cfg["max_iter"],
                class_weight=mod_cfg["class_weight"],
                solver=mod_cfg["solver"],
                random_state=42,
            )),
        ])
        logger.info(f"LRModel creado — C={c_value:.4f} | ngram={ngram}")

    @staticmethod
    def _parse_ngram_range(raw, default=(1, 2)):
        """
        Convert a stored n-gram setting into a ``(min_n, max_n)`` tuple.

        Accepts the ``"1_2"`` string form as well as a two-item list or
        tuple. ``None`` yields ``default`` silently; any other unusable
        value yields ``default`` with a warning.
        """
        if raw is None:
            return default
        parts = list(raw) if isinstance(raw, (list, tuple)) else str(raw).split("_")
        try:
            low, high = (int(p) for p in parts)
        except (TypeError, ValueError):
            low = high = 0  # forces the validity check below to fail
        if low < 1 or high < low:
            logger.warning(
                f"ngram_range no válido en best_params ({raw!r}) — usando {default}"
            )
            return default
        return (low, high)
# ── Random Forest ──────────────────────────────────────────────────────────────
class RFModel(BaseSklearnModel):
    """
    Random Forest + TF-IDF.

    Parameters come from the YAML model and feature configs.
    """

    def __init__(
        self,
        config_path: str = "configs/models.yaml",
        feat_config_path: str = "configs/features.yaml",
    ):
        """
        Build the TF-IDF + RandomForestClassifier pipeline.

        Args:
            config_path: YAML file containing ``models.random_forest``.
            feat_config_path: YAML file containing ``vectorization.tfidf``.

        Raises:
            FileNotFoundError: if either config file is missing.
            KeyError: if a required config key is missing.
        """
        super().__init__()

        # Explicit encoding so the YAML is decoded the same way on every
        # platform instead of depending on the locale default.
        with open(config_path, encoding="utf-8") as f:
            rf_cfg = yaml.safe_load(f)["models"]["random_forest"]
        with open(feat_config_path, encoding="utf-8") as f:
            vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]

        self.pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(
                max_features=vec_cfg["max_features"],
                ngram_range=(1, 1),  # RF with bigrams is very slow
                sublinear_tf=vec_cfg["sublinear_tf"],
                min_df=vec_cfg["min_df"],
                analyzer="word",
                strip_accents="unicode",
            )),
            ("clf", RandomForestClassifier(
                n_estimators=rf_cfg["n_estimators"],
                max_depth=rf_cfg.get("max_depth", 8),
                min_samples_leaf=rf_cfg.get("min_samples_leaf", 4),
                max_features="sqrt",
                class_weight=rf_cfg["class_weight"],
                random_state=42,
                n_jobs=-1,
            )),
        ])
        logger.info("RFModel creado")
# ── XGBoost ───────────────────────────────────────────────────────────────────
class XGBModel(BaseSklearnModel):
    """
    XGBoost + TF-IDF.

    Requires: pip install xgboost
    """

    def __init__(
        self,
        config_path: str = "configs/models.yaml",
        feat_config_path: str = "configs/features.yaml",
    ):
        """
        Build the TF-IDF + XGBClassifier pipeline.

        Args:
            config_path: YAML file containing ``models.xgboost``.
            feat_config_path: YAML file containing ``vectorization.tfidf``.

        Raises:
            ImportError: if the ``xgboost`` package is not installed.
            FileNotFoundError: if either config file is missing.
            KeyError: if a required config key is missing.
        """
        super().__init__()

        # Imported lazily so the rest of the module works without xgboost.
        try:
            from xgboost import XGBClassifier
        except ImportError as err:
            # Chain the cause so the underlying import failure stays visible.
            raise ImportError("Instala XGBoost: pip install xgboost") from err

        # Explicit encoding so the YAML is decoded the same way on every
        # platform instead of depending on the locale default.
        with open(config_path, encoding="utf-8") as f:
            xgb_cfg = yaml.safe_load(f)["models"]["xgboost"]
        with open(feat_config_path, encoding="utf-8") as f:
            vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]

        self.pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(
                max_features=vec_cfg["max_features"],
                ngram_range=(1, 1),
                # NOTE(review): hard-coded here, whereas the other models in
                # this file read sublinear_tf from the features config --
                # confirm this is intentional.
                sublinear_tf=True,
                min_df=vec_cfg["min_df"],
                analyzer="word",
                strip_accents="unicode",
            )),
            ("clf", XGBClassifier(
                n_estimators=xgb_cfg.get("n_estimators", 200),
                max_depth=xgb_cfg.get("max_depth", 3),
                learning_rate=xgb_cfg.get("learning_rate", 0.05),
                subsample=xgb_cfg.get("subsample", 0.8),
                colsample_bytree=xgb_cfg.get("colsample_bytree", 0.8),
                # NOTE(review): presumably only meaningful on older XGBoost
                # releases -- check the pinned version before removing.
                use_label_encoder=False,
                eval_metric="logloss",
                random_state=42,
                verbosity=0,
            )),
        ])
        logger.info("XGBModel creado")
# ── Factory ───────────────────────────────────────────────────────────────────
def build_model(
    model_type: str,
    config_path: str = "configs/models.yaml",
    feat_config_path: str = "configs/features.yaml",
    best_params_path: str = "configs/best_params.yaml",
) -> BaseSklearnModel:
    """
    Build the model selected by ``model_type``.

    Args:
        model_type: "lr" | "rf" | "xgboost"
        config_path: model YAML config handed to the model constructor.
        feat_config_path: feature YAML config handed to the model constructor.
        best_params_path: tuned-parameter YAML; only used by "lr".

    Returns:
        Model instance ready for .fit()

    Raises:
        ValueError: if ``model_type`` is not one of the supported keys.
    """
    common_args = (config_path, feat_config_path)
    # Maps each supported key to (model class, positional constructor args).
    registry = {
        "lr": (LRModel, common_args + (best_params_path,)),
        "rf": (RFModel, common_args),
        "xgboost": (XGBModel, common_args),
    }

    entry = registry.get(model_type)
    if entry is None:
        raise ValueError(f"model_type debe ser uno de: {list(registry)}")

    logger.info(f"Construyendo modelo: {model_type}")
    model_cls, ctor_args = entry
    return model_cls(*ctor_args)