|
from __future__ import annotations
|
|
from typing import Iterable, Union
|
|
from numpy import ndarray
|
|
from pandas import DataFrame
|
|
from sklearn.pipeline import Pipeline
|
|
import numpy as np
|
|
import sys
|
|
import skops.io as sio
|
|
from huggingface_hub import hf_hub_download
|
|
import pandas as pd
|
|
|
|
# Label value produced by the first-stage ("noise") classifier that marks a row
# for re-classification by the second-stage ("sua") classifier in
# CascadedClassifier.predict; rows with any other label keep their first-stage
# prediction.
PREDICTOR_SPLIT_TARGET = 0
|
|
|
|
class CascadedClassifier(Pipeline):
    """Two-stage classifier built from a "noise" step and a "sua" step.

    Both stages are looked up by name in ``named_steps`` and the element at
    index 0 of each step is used as the estimator (its first estimator when
    the step is a ``Pipeline``, as the ``steps`` annotation indicates).
    """

    def __init__(self, steps: list[tuple[str, Pipeline]], memory=None):
        """
        Initializes a cascaded classifier pipeline with two classification steps.

        Parameters
        ----------
        steps : list[tuple[str, Pipeline]]
            A list of (name, pipeline) tuples. The other methods of this class
            look the stages up under the names ``"noise"`` and ``"sua"``.
        memory : optional
            Forwarded unchanged to ``Pipeline.__init__``.

        Raises
        ------
        AssertionError
            If ``steps`` does not contain exactly two entries.
        """
        super().__init__(steps, memory=memory)
        # NOTE: kept as an ``assert`` so the raised exception type is unchanged
        # for existing callers; be aware that it is skipped under ``python -O``.
        assert len(steps) == 2, 'CascadedClassifier must have exactly 2 steps'
        self._steps = steps

    @property
    def feature_names_in_(self) -> list[str]:
        """
        Returns the feature names reported by the noise classifier.

        Returns
        -------
        list[str]
            The ``feature_names_in_`` attribute of the element at index 0 of
            the ``"noise"`` step.
        """
        return self.named_steps["noise"][0].feature_names_in_

    def predict(self, X: list[str] | ndarray | Iterable | DataFrame, **predict_params) -> ndarray:
        """
        Predicts labels for the input data using a cascading approach.

        The noise classifier labels every row. Rows it labels
        ``PREDICTOR_SPLIT_TARGET`` are then passed to the SUA classifier, and
        its predictions (shifted by 2) overwrite the first-stage labels for
        those rows. All other rows keep their first-stage label.

        Parameters
        ----------
        X : list[str] | ndarray | Iterable | DataFrame
            The input data. ``list``/``tuple`` inputs are converted with
            ``np.asarray`` before rows are selected for the second stage.
        predict_params : dict
            Accepted but currently not forwarded to either classifier.

        Returns
        -------
        ndarray
            The predicted labels, one per input row.
        """
        y = self.named_steps["noise"][0].predict(X)

        # Rows the first stage hands over to the second stage.
        forward_rows = np.asarray(y == PREDICTOR_SPLIT_TARGET)
        if not forward_rows.any():
            return y

        # Plain Python sequences do not support boolean-mask indexing, so
        # convert them; arrays, DataFrames and other mask-indexable containers
        # are used as-is.
        rows = np.asarray(X) if isinstance(X, (list, tuple)) else X
        y2 = np.asarray(self.named_steps["sua"][0].predict(rows[forward_rows]))

        # Shift the second-stage labels by 2 (presumably so that 0/1 outputs
        # become 2/3 and stay distinct from the first-stage labels). A new
        # array is built instead of ``+=`` so the classifier's own output is
        # not mutated and dtypes that cannot hold the sum in place still work.
        y[forward_rows] = y2 + 2
        return y

    def predict_proba(
        self,
        X: Union[list[str], ndarray, Iterable, pd.DataFrame],
    ) -> ndarray:
        """
        Combine the probabilities of both classifiers into a 3-column array.

        Both classifiers are evaluated on every row of ``X``. For each row the
        noise classifier's two probabilities decide which values are reported:

        * noise ``[:, 0]`` strictly greater than noise ``[:, 1]``:
          ``[0, sua[:, 0], sua[:, 1]]``
        * otherwise (including ties):
          ``[noise[:, 1], noise[:, 0], 0]``

        Parameters
        ----------
        X : Union[list[str], ndarray, Iterable, pd.DataFrame]
            The input data. Must support ``len()``.

        Returns
        -------
        ndarray
            Array of shape (n_samples, 3) and dtype float64. An empty input
            yields an array of shape (0, 3).

        Raises
        ------
        RuntimeError
            If anything fails while computing or combining the probabilities;
            the original exception is attached as ``__cause__``.

        Notes
        -----
        No explicit normalization is performed: each row sums to 1 only when
        the two probabilities reported by the selected classifier sum to 1.
        Only columns 0 and 1 of each classifier's output are read.
        """
        # ``len`` rather than truthiness: arrays and DataFrames do not define
        # an unambiguous boolean value.
        if len(X) == 0:
            return np.empty((0, 3), dtype=np.float64)

        n_samples = len(X)
        out_proba = np.zeros((n_samples, 3), dtype=np.float64)

        try:
            y_proba_noise = np.asarray(self.named_steps["noise"][0].predict_proba(X))
            y_proba_sua = np.asarray(self.named_steps["sua"][0].predict_proba(X))

            # NOTE(review): column 0 carries the noise classifier's index-1
            # probability, so the order looks like [noise, SUA, MUA] - confirm
            # against the trained models' ``classes_``.
            # Strict ``>`` keeps ties (and NaN comparisons) on the "otherwise"
            # branch.
            use_second_stage = y_proba_noise[:, 0] > y_proba_noise[:, 1]

            out_proba[:, 0] = np.where(use_second_stage, 0.0, y_proba_noise[:, 1])
            out_proba[:, 1] = np.where(
                use_second_stage, y_proba_sua[:, 0], y_proba_noise[:, 0]
            )
            out_proba[:, 2] = np.where(use_second_stage, y_proba_sua[:, 1], 0.0)

            return out_proba

        except Exception as e:
            raise RuntimeError(
                f"Error during probability prediction: {str(e)}"
            ) from e
|
|
|
|
|