curation_machine_learning_models / cascaded_classifier.py

Upload cascaded_classifier.py

f47b72d verified 19 days ago

4.86 kB

	from __future__ import annotations
	from typing import Iterable, Union
	from numpy import ndarray
	from pandas import DataFrame
	from sklearn.pipeline import Pipeline
	import numpy as np
	import sys
	import skops.io as sio
	from huggingface_hub import hf_hub_download
	import pandas as pd

	# Label of the first-stage ("noise") classifier that marks a row as 'not noise'.
	# CascadedClassifier.predict compares the first-stage predictions against this
	# value and forwards only the matching rows to the second-stage ("sua") classifier.
	PREDICTOR_SPLIT_TARGET = 0 # Represents 'not noise'.

	class CascadedClassifier(Pipeline):
	    """
	    Two-stage (cascaded) classifier built on top of ``sklearn.pipeline.Pipeline``.

	    The pipeline holds exactly two steps that are looked up by name:
	    ``"noise"`` (first stage) and ``"sua"`` (second stage). For both stages
	    only element ``[0]`` of the named step is used for prediction.

	    ``predict`` keeps the first-stage label for every row that is not
	    ``PREDICTOR_SPLIT_TARGET`` and replaces the remaining rows with the
	    second-stage label shifted by 2, so the two label sets do not collide.
	    """

	    def __init__(self, steps: list[tuple[str, Pipeline]], memory=None):
	        """
	        Initializes a cascaded classifier pipeline with two classification steps.

	        Parameters
	        ----------
	        steps: list[tuple[str, Pipeline]]
	            A list of (name, pipeline) tuples for the noise and SUA classifiers.
	            The other methods look the steps up as ``"noise"`` and ``"sua"``.
	        memory: optional
	            Forwarded unchanged to ``Pipeline.__init__``.

	        Raises
	        ------
	        AssertionError
	            If ``steps`` does not contain exactly two entries.
	        """
	        super().__init__(steps, memory=memory)
	        # Explicit raise instead of an `assert` statement so the check is not
	        # stripped under `python -O`; the exception type and message are unchanged.
	        if len(steps) != 2:
	            raise AssertionError('CascadedClassifier must have exactly 2 steps')
	        self._steps = steps

	    @property
	    def feature_names_in_(self) -> list[str]:
	        """
	        Returns the feature names used in the noise classifier.

	        Returns
	        -------
	        list[str]
	            The ``feature_names_in_`` attribute of element ``[0]`` of the
	            ``"noise"`` step, returned as-is.
	        """
	        return self.named_steps["noise"][0].feature_names_in_

	    def predict(self, X: list[str] | ndarray | Iterable | DataFrame, **predict_params) -> ndarray:
	        """
	        Predicts labels for the input data using a cascading approach.

	        Parameters
	        ----------
	        X: list[str] | ndarray | Iterable | DataFrame
	            The input data. Plain lists/tuples are converted to an ndarray
	            before row selection; any other input must support boolean-mask
	            indexing (``X[mask]``), as ndarray and DataFrame do.
	        predict_params: dict
	            Accepted for signature compatibility; currently not forwarded to
	            either classifier.

	        Returns
	        -------
	        ndarray
	            The first-stage predictions, where every row predicted as
	            ``PREDICTOR_SPLIT_TARGET`` ('not noise') has been replaced by the
	            second-stage prediction plus 2.
	            NOTE(review): presumably 2 = MUA and 3 = SUA, matching the column
	            comments in ``predict_proba`` -- confirm against the trained models.
	        """
	        # Offset added to second-stage labels; assumes noise is labeled as 0 or 1.
	        label_offset = 2

	        # Step 1: initial predictions from the noise classifier (original X).
	        y = self.named_steps["noise"][0].predict(X)

	        # Rows where the first stage says 'not noise' go to the second stage.
	        predict_rows = (y == PREDICTOR_SPLIT_TARGET)
	        if not predict_rows.any():
	            # Nothing to refine: return the first-stage predictions unchanged.
	            return y

	        # Python lists/tuples cannot be indexed with a boolean mask, so convert
	        # them for row selection only; other containers are used untouched.
	        X_rows = np.asarray(X) if isinstance(X, (list, tuple)) else X
	        X_predict = X_rows[predict_rows]

	        # Step 2: SUA classifier on the 'not noise' subset.
	        y2 = self.named_steps["sua"][0].predict(X_predict)

	        # Shift out-of-place so the classifier's own output array is not mutated.
	        y[predict_rows] = y2 + label_offset

	        return y

	    def predict_proba(
	        self,
	        X: list[str] | ndarray | Iterable | DataFrame,
	    ) -> ndarray:
	        """
	        Predict per-class probabilities by gating on the noise classifier.

	        Both classifiers are evaluated on every row of ``X``. For each row:

	        * if the noise classifier's column 0 ('neural') is strictly greater
	          than its column 1 ('noise'), the row is
	          ``[0, sua_proba[0], sua_proba[1]]``;
	        * otherwise the row is ``[noise_proba[1], noise_proba[0], 0]``.

	        Parameters
	        ----------
	        X : list[str] | ndarray | Iterable | DataFrame
	            The input data. Must support ``len``.

	        Returns
	        -------
	        ndarray
	            Array of shape (n_samples, 3) and dtype float64. An empty input
	            yields an array of shape (0, 3).

	        Raises
	        ------
	        RuntimeError
	            If either classifier, or the combination of their outputs, raises;
	            the original exception is chained as the cause.

	        Notes
	        -----
	        The output columns are ordered as [noise, MUA, SUA].
	        No explicit normalization is performed: a row sums to 1 whenever the
	        two probabilities it is built from sum to 1, which holds if each
	        classifier returns two-class probabilities.
	        """
	        if len(X) == 0:
	            return np.array([], dtype=np.float64).reshape(0, 3)

	        # Initialize probabilities array with zeros
	        n_samples = len(X)
	        out_proba = np.zeros((n_samples, 3), dtype=np.float64)

	        try:
	            # Noise classifier probabilities: column 0 = neural, column 1 = noise.
	            y_proba_noise = np.asarray(self.named_steps["noise"][0].predict_proba(X))
	            # SUA classifier probabilities: column 0 = MUA, column 1 = SUA.
	            y_proba_sua = np.asarray(self.named_steps["sua"][0].predict_proba(X))

	            p_neural = y_proba_noise[:, 0]
	            p_noise = y_proba_noise[:, 1]
	            # Strict comparison: ties (and NaN) fall into the 'noise' branch.
	            is_neural = p_neural > p_noise

	            # noise: 0 for neural rows, otherwise the noise probability.
	            out_proba[:, 0] = np.where(is_neural, 0.0, p_noise)
	            # MUA: SUA-classifier MUA probability for neural rows, otherwise the
	            # neural probability of the noise classifier.
	            out_proba[:, 1] = np.where(is_neural, y_proba_sua[:, 0], p_neural)
	            # SUA: SUA-classifier SUA probability for neural rows, otherwise 0.
	            out_proba[:, 2] = np.where(is_neural, y_proba_sua[:, 1], 0.0)

	            return out_proba

	        except Exception as e:
	            raise RuntimeError(
	                f"Error during probability prediction: {str(e)}"
	            ) from e