# curation_machine_learning_models / cascaded_classifier.py
# AnoushkaJain3's picture
# Upload cascaded_classifier.py
# f47b72d verified
# raw
# history blame
# 4.86 kB
from __future__ import annotations
from typing import Iterable, Union
from numpy import ndarray
from pandas import DataFrame
from sklearn.pipeline import Pipeline
import numpy as np
import sys
import skops.io as sio
from huggingface_hub import hf_hub_download
import pandas as pd
# Label produced by the first ("noise") classifier that marks a row as
# 'not noise'. Only rows predicted with this label are forwarded to the
# second ("sua") classifier in CascadedClassifier.predict.
PREDICTOR_SPLIT_TARGET = 0 # Represents 'not noise'.
class CascadedClassifier(Pipeline):
    """
    Two-stage classifier built from a "noise" step and a "sua" step.

    The "noise" step is evaluated on every row; rows it labels as
    ``PREDICTOR_SPLIT_TARGET`` ('not noise') are then re-labelled by the
    "sua" step. Both steps are looked up by name in ``named_steps`` and the
    estimator at index 0 of each step is the one that is called.
    """

    def __init__(self, steps: list[tuple[str, Pipeline]], memory=None):
        """
        Initializes a cascaded classifier pipeline with two classification steps.

        Parameters
        ----------
        steps: list[tuple[str, Pipeline]]
            A list of (name, pipeline) tuples for noise and SUA classifiers.
            The other methods look the steps up under the names "noise" and
            "sua".
        memory: optional
            Used to cache the fitted transformers of the pipeline.

        Raises
        ------
        AssertionError
            If ``steps`` does not contain exactly two entries.
        """
        super().__init__(steps, memory=memory)
        # NOTE(review): ``assert`` is stripped under ``python -O``; it is kept
        # here so the exception type seen by existing callers does not change.
        assert len(steps) == 2, 'CascadedClassifier must have exactly 2 steps'
        self._steps = steps

    @property
    def feature_names_in_(self) -> list[str]:
        """
        Returns the feature names used in the noise classifier.

        Returns
        -------
        list[str]
            The ``feature_names_in_`` attribute of the estimator at index 0
            of the "noise" step.
        """
        return self.named_steps["noise"][0].feature_names_in_

    def predict(self, X: list[str] | ndarray | Iterable | DataFrame, **predict_params) -> ndarray:
        """
        Predicts labels for the input data using a cascading approach.

        Parameters
        ----------
        X: list[str] | ndarray | Iterable | DataFrame
            The input data. Lists and tuples are converted to a NumPy array
            before the 'not noise' rows are selected; any other input must
            support boolean-mask indexing itself.
        predict_params: dict
            Accepted for signature compatibility; not forwarded to either
            classifier.

        Returns
        -------
        ndarray
            The noise classifier's predictions, where every row predicted as
            ``PREDICTOR_SPLIT_TARGET`` has been replaced by the SUA
            classifier's label shifted by +2.
        """
        # Step 1: initial predictions from the noise classifier.
        y = self.named_steps["noise"][0].predict(X)

        # Rows predicted as 'not noise' go on to the second stage.
        predict_rows = (y == PREDICTOR_SPLIT_TARGET)

        # Plain Python sequences cannot be indexed with a boolean mask, so
        # convert them for the subset selection only.
        if isinstance(X, (list, tuple)):
            X_predict = np.asarray(X)[predict_rows]
        else:
            X_predict = X[predict_rows]

        # Nothing to refine: the noise predictions are the final answer.
        if len(X_predict) == 0:
            return y

        # Step 2: SUA classifier on the 'not noise' subset.
        y2 = self.named_steps["sua"][0].predict(X_predict)

        # Shift the second-stage labels by 2 so they cannot collide with the
        # noise classifier's labels (assumed to be 0 or 1). A new array is
        # built rather than mutating the sub-classifier's output in place.
        y[predict_rows] = y2 + 2
        return y

    def predict_proba(
        self,
        X: Union[list[str], ndarray, Iterable, pd.DataFrame],
    ) -> ndarray:
        """
        Combine the probabilities of both classifiers into one 3-column array.

        Parameters
        ----------
        X : Union[list[str], ndarray, Iterable, pd.DataFrame]
            The input data. Must support ``len``.

        Returns
        -------
        ndarray
            Array of shape (n_samples, 3) and dtype float64.

        Raises
        ------
        RuntimeError
            If either classifier's ``predict_proba`` or the combination step
            fails; the original exception is chained as the cause.

        Notes
        -----
        Column 0 holds the noise probability. Columns 1 and 2 hold the first
        and second columns of the SUA classifier's output; the original
        inline comments call these MUA and SUA respectively, which depends on
        that classifier's class order (TODO confirm).

        For each row, with ``p_noise`` the noise classifier's output:

        * if ``p_noise[0] > p_noise[1]`` (neural more likely than noise):
          ``[0, p_sua[0], p_sua[1]]``
        * otherwise: ``[p_noise[1], p_noise[0], 0]``

        No explicit normalization is applied; each row sums to 1 only because
        it copies one complete probability row from one of the classifiers.
        """
        if len(X) == 0:
            return np.array([], dtype=np.float64).reshape(0, 3)

        # Start from zeros so the "no probability" cells need no assignment.
        n_samples = len(X)
        out_proba = np.zeros((n_samples, 3), dtype=np.float64)

        try:
            # Both classifiers are evaluated on every row.
            y_proba_noise = np.asarray(self.named_steps["noise"][0].predict_proba(X))
            y_proba_sua = np.asarray(self.named_steps["sua"][0].predict_proba(X))

            # True where neural is strictly more likely than noise; ties (and
            # NaN comparisons) fall into the noise branch, as before.
            neural_rows = y_proba_noise[:, 0] > y_proba_noise[:, 1]
            noise_rows = ~neural_rows

            # neural > noise: column 0 stays 0, the rest comes from the SUA step.
            out_proba[neural_rows, 1] = y_proba_sua[neural_rows, 0]  # MUA
            out_proba[neural_rows, 2] = y_proba_sua[neural_rows, 1]  # SUA

            # noise >= neural: column 2 stays 0, the rest comes from the noise step.
            out_proba[noise_rows, 0] = y_proba_noise[noise_rows, 1]  # noise
            out_proba[noise_rows, 1] = y_proba_noise[noise_rows, 0]  # MUA (neural probability)

            return out_proba
        except Exception as e:
            raise RuntimeError(
                f"Error during probability prediction: {str(e)}"
            ) from e