from __future__ import annotations

import sys
from typing import Iterable, Union

import numpy as np
import pandas as pd
import skops.io as sio
from huggingface_hub import hf_hub_download
from numpy import ndarray
from pandas import DataFrame
from sklearn.pipeline import Pipeline

# Label of the noise classifier that sends a row on to the second ("sua")
# stage; every other noise-classifier label is returned as the final label.
PREDICTOR_SPLIT_TARGET = 0  # Represents 'not noise'.

# Added to second-stage labels so they cannot collide with the labels of the
# noise classifier (assumed to be 0 or 1).
_SUA_LABEL_OFFSET = 2


class CascadedClassifier(Pipeline):
    """
    Two-stage (cascaded) classifier built from a "noise" and a "sua" step.

    The "noise" step is evaluated on every row. Rows it labels
    ``PREDICTOR_SPLIT_TARGET`` ('not noise') are then classified by the "sua"
    step. Both steps are looked up by name in ``named_steps`` and element
    ``[0]`` of each step is the object whose ``predict`` / ``predict_proba``
    is called.
    """

    def __init__(self, steps: list[tuple[str, Pipeline]], memory=None):
        """
        Initializes a cascaded classifier pipeline with two classification steps.

        Parameters
        ----------
        steps: list[tuple[str, Pipeline]]
            A list of (name, pipeline) tuples for noise and SUA classifiers.
            The prediction methods look the steps up under the names
            ``"noise"`` and ``"sua"``; only the number of steps is checked
            here.
        memory: optional
            Used to cache the fitted transformers of the pipeline.

        Raises
        ------
        AssertionError
            If ``steps`` does not contain exactly two entries.
        """
        super().__init__(steps, memory=memory)
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type and message are the
        # same ones the ``assert`` statement produced.
        if len(steps) != 2:
            raise AssertionError('CascadedClassifier must have exactly 2 steps')
        self._steps = steps

    @property
    def feature_names_in_(self) -> list[str]:
        """
        Returns the feature names used in the noise classifier.

        Returns
        -------
        list[str]
            The ``feature_names_in_`` attribute of element ``[0]`` of the
            "noise" step.
        """
        return self._first_estimator("noise").feature_names_in_

    def _first_estimator(self, step_name: str):
        """Return element ``[0]`` of the step registered under ``step_name``."""
        return self.named_steps[step_name][0]

    @staticmethod
    def _select_rows(X, mask: ndarray):
        """Return the rows of ``X`` selected by the boolean array ``mask``."""
        try:
            return X[mask]
        except TypeError:
            # Plain Python sequences (list, tuple) cannot be indexed with a
            # boolean array; go through NumPy for those.
            return np.asarray(X)[mask]

    def predict(self, X: list[str] | ndarray | Iterable | DataFrame, **predict_params) -> ndarray:
        """
        Predicts labels for the input data using a cascading approach.

        Parameters
        ----------
        X: list[str] | ndarray | Iterable | DataFrame
            The input data.
        predict_params: dict
            Accepted for signature compatibility; not forwarded to either
            classifier.

        Returns
        -------
        ndarray
            The predicted labels. Rows the noise classifier labels as
            something other than ``PREDICTOR_SPLIT_TARGET`` keep that label;
            the remaining rows carry the "sua" classifier's label shifted by
            2 (so, assuming both classifiers emit labels 0/1, the result
            contains 1, 2 and 3).
        """
        # Step 1: Get initial predictions from the noise classifier.
        labels = self._first_estimator("noise").predict(X)

        # Identify rows where the prediction is 'not noise'.
        needs_second_stage = labels == PREDICTOR_SPLIT_TARGET

        # If no rows require further classification, return the initial predictions.
        if not np.any(needs_second_stage):
            return labels

        # Step 2: Get predictions from the SUA classifier for the 'not noise' subset.
        X_subset = self._select_rows(X, needs_second_stage)
        second_stage_labels = self._first_estimator("sua").predict(X_subset)

        # Shift the SUA/MUA labels to avoid overlap with noise labels and write
        # them over the 'not noise' entries. A new array is built instead of
        # using ``+=`` so the array returned by the sub-estimator is not mutated.
        labels[needs_second_stage] = second_stage_labels + _SUA_LABEL_OFFSET
        return labels

    def predict_proba(
        self,
        X: Union[list[str], ndarray, Iterable, pd.DataFrame],
    ) -> ndarray:
        """
        Predict three-class probabilities by combining both classifiers.

        Parameters
        ----------
        X : Union[list[str], ndarray, Iterable, pd.DataFrame]
            The input data. Must support ``len()``.

        Returns
        -------
        ndarray
            Array of dtype float64 and shape (n_samples, 3) with columns
            ordered as [noise, MUA, SUA]. An empty input yields an array of
            shape (0, 3).

        Raises
        ------
        RuntimeError
            If either classifier fails or the outputs cannot be combined; the
            original exception is chained as the cause.

        Notes
        -----
        Both classifiers are evaluated on every row. For each row:

        * if the noise classifier's column 0 (neural) is strictly greater
          than its column 1 (noise), the row becomes
          ``[0, sua[:, 0], sua[:, 1]]``;
        * otherwise the row becomes ``[noise[:, 1], noise[:, 0], 0]``, i.e.
          the whole neural probability is reported in the MUA column.

        No explicit renormalization is performed: rows sum to 1 whenever the
        two columns read from each classifier sum to 1. The column meanings
        above assume the noise classifier's classes are ordered
        (neural, noise) and the SUA classifier's (MUA, SUA) -- confirm
        against the fitted models' ``classes_``.
        """
        # ``len`` rather than truthiness: ndarray/DataFrame truth values are
        # ambiguous.
        n_samples = len(X)
        if n_samples == 0:
            return np.array([], dtype=np.float64).reshape(0, 3)

        # Columns: 0 = noise, 1 = MUA, 2 = SUA. Entries not written below stay 0.
        out_proba = np.zeros((n_samples, 3), dtype=np.float64)

        try:
            # Get noise classifier probabilities
            noise_proba = self._first_estimator("noise").predict_proba(X)
            # Get SUA vs MUA probabilities
            sua_proba = self._first_estimator("sua").predict_proba(X)

            # Strict comparison: ties (and NaN) fall into the noise branch.
            is_neural = noise_proba[:, 0] > noise_proba[:, 1]
            is_noise = ~is_neural

            # neural > noise: no noise mass, split between MUA and SUA.
            out_proba[is_neural, 1] = sua_proba[is_neural, 0]  # MUA
            out_proba[is_neural, 2] = sua_proba[is_neural, 1]  # SUA

            # noise >= neural: noise probability plus neural mass as MUA.
            out_proba[is_noise, 0] = noise_proba[is_noise, 1]  # noise
            out_proba[is_noise, 1] = noise_proba[is_noise, 0]  # MUA (neural probability)

            return out_proba

        except Exception as e:
            raise RuntimeError(
                f"Error during probability prediction: {str(e)}"
            ) from e