libokj's picture
Upload 110 files
c0ec7e6
raw
history blame
3.75 kB
from numbers import Number
from typing import Optional, Union
import numpy as np
from deepscreen.utils import get_logger
log = get_logger(__name__)
# Converters from a value expressed in the keyed unit to the p scale (-log10 of the molar concentration).
# The additive offset is the power of ten separating the unit from molar (e.g. 1 nM = 1e-9 M -> +9).
MOLARITY_TO_POTENCY = {
    'p': lambda x: x,  # already on the p scale: pass through unchanged
    'M': lambda x: -np.log10(x),
    'mM': lambda x: -np.log10(x) + 3,
    # Micromolar can be typed with two visually identical code points; accept both of them.
    '\u03bcM': lambda x: -np.log10(x) + 6,  # GREEK SMALL LETTER MU (U+03BC)
    '\u00b5M': lambda x: -np.log10(x) + 6,  # MICRO SIGN (U+00B5)
    'uM': lambda x: -np.log10(x) + 6,  # ASCII spelling of micromolar
    'nM': lambda x: -np.log10(x) + 9,
    'pM': lambda x: -np.log10(x) + 12,
    'fM': lambda x: -np.log10(x) + 15,
}
# TODO rewrite for swifter.apply
def molar_to_p(labels, units):
    """Convert concentration labels to the p scale (-log10 of the molar concentration).

    :param labels: a sequence of numeric labels
    :type labels: array_like
    :param units: either one unit string applied to every label, or a sequence holding one unit per label;
        every unit must be a key of ``MOLARITY_TO_POTENCY``
    :type units: str or array_like
    :return: a numpy array of the converted labels
    :raises ValueError: if a unit is not a key of ``MOLARITY_TO_POTENCY``, or if ``labels`` and ``units``
        do not have the same length
    """
    if isinstance(units, str):
        # One unit shared by all labels: convert the whole array in a single vectorized call.
        if units not in MOLARITY_TO_POTENCY:
            raise ValueError(f"Unknown unit {units!r}. Allowed units: {', '.join(MOLARITY_TO_POTENCY)}.")
        return np.asarray(MOLARITY_TO_POTENCY[units](np.asarray(labels)))

    # Materialize both inputs so they can be measured and iterated more than once.
    labels = list(labels)
    units = list(units)
    if len(labels) != len(units):
        raise ValueError(f"Got {len(labels)} labels but {len(units)} units; exactly one unit per label is required.")

    # dict.fromkeys de-duplicates while keeping first-seen order, so each bad unit is reported once.
    unknown_units = [unit for unit in dict.fromkeys(units) if unit not in MOLARITY_TO_POTENCY]
    if unknown_units:
        raise ValueError(
            f"Unknown unit(s): {', '.join(map(repr, unknown_units))}. "
            f"Allowed units: {', '.join(MOLARITY_TO_POTENCY)}."
        )

    # zip pairs every label with its own unit (iterating over the bare tuple `(labels, units)` would not).
    return np.array([MOLARITY_TO_POTENCY[unit](label) for label, unit in zip(labels, units)])
def label_discretize(labels, thresholds):
    """Map continuous labels to discrete levels, in descending order of the label value.

    The level of a label is the number of thresholds that are strictly greater than it: labels at or above
    the greatest threshold get level 0 and labels below the lowest threshold get the highest level
    (the number of thresholds). A single threshold therefore maps labels below it to 1 and all others to 0,
    whether it is given as a bare number or as a one-element sequence.

    :param labels: continuous labels
    :type labels: array_like
    :param thresholds: a single threshold or a sequence of thresholds, in any order
    :type thresholds: float or array_like
    :return: integer levels with the same shape as ``labels``
    """
    # atleast_1d lets a scalar threshold share the code path of a sequence; np.sort returns ascending edges.
    edges = np.sort(np.atleast_1d(thresholds))
    # searchsorted(side='right') counts the thresholds that are <= each label. Subtracting that count from the
    # number of thresholds leaves the number of thresholds above the label. Unlike np.digitize, this does not
    # guess the bin direction from the data, so one-element and duplicated thresholds are handled consistently.
    return len(edges) - np.searchsorted(edges, labels, side='right')
def _is_given(value):
    """Return True when an optional argument carries a usable value (not None, not a falsy scalar, not empty)."""
    if value is None:
        return False
    if isinstance(value, (str, Number)):
        return bool(value)
    # Sequences, numpy arrays and pandas objects: count elements instead of relying on their truth value,
    # which is ambiguous (and raises) for multi-element arrays.
    return np.size(value) > 0


def label_transform(
        labels,
        units: Optional[list[str]],
        thresholds: Optional[Union[float, list[Number]]],
        discard_intermediate: Optional[bool]
):
    """Convert labels of all units to p scale (-log10[M]) and discretize them if specified.

    Labels that are already discrete are not detected here; callers should leave ``units`` and ``thresholds``
    unset for such data.

    :param labels: a sequence of labels, continuous or binary values
    :type labels: array_like
    :param units: the units of the labels, each a key of ``MOLARITY_TO_POTENCY``; the conversion is skipped
        when this is None or empty
    :type units: array_like, optional
    :param thresholds: discretization threshold(s) for affinity labels, in p scale (-log10[M]).
        A single number maps affinities below it to 1 and otherwise to 0.
        A sequence of two or more thresholds maps affinities to multiple discrete levels descendingly, assigning
        values below the lowest threshold to the highest level (e.g. 2) and values above the greatest threshold
        to 0. Discretization is skipped when this is None, empty or a falsy number.
    :type thresholds: list, float, optional
    :param discard_intermediate: whether to discard the intermediate (indeterminate) level if provided an odd
        number of thresholds (>=3); only takes effect when ``thresholds`` is given
    :type discard_intermediate: bool
    :return: a numpy array of affinity labels in p scale (-log10[M]) or discrete labels; when an intermediate
        level is discarded the array is of float dtype with NaN marking the discarded labels
    :raises ValueError: if ``discard_intermediate`` is set without an odd number of at least 3 thresholds
    """
    if _is_given(units):
        labels = molar_to_p(labels, units)

    if _is_given(thresholds):
        labels = label_discretize(labels, thresholds)

        if discard_intermediate:
            # np.size also works for a scalar threshold (size 1), which is then rejected with a clear message
            # instead of failing inside len().
            n_thresholds = np.size(thresholds)
            if n_thresholds % 2 != 1 or n_thresholds < 3:
                raise ValueError(
                    "Must give an odd number of (at least 3) thresholds to discard the intermediate level."
                )
            # NOTE(review): k thresholds produce k + 1 levels, so an odd k gives an even number of levels;
            # confirm that level k // 2 is really the intended "intermediate" level.
            intermediate_level = n_thresholds // 2

            # The discretized levels are integers and cannot hold NaN, so switch to float first.
            levels = np.asarray(labels, dtype=float)
            # Levels above the intermediate one move down by 1 to stay contiguous.
            shifted = np.where(levels > intermediate_level, levels - 1, levels)
            # Intermediate-level labels become NaN (which will be filtered out later).
            labels = np.where(levels == intermediate_level, np.nan, shifted)

    return labels