B3clf / b3clf /utils.py
legend1234's picture
Synced repo using 'sync_with_huggingface' Github Action
0e95800
# -*- coding: utf-8 -*-
# The B3clf library computes the blood-brain barrier (BBB) permeability
# of organic molecules with resampling strategies.
#
# Copyright (C) 2021 The Ayers Lab
#
# This file is part of B3clf.
#
# B3clf is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# B3clf is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
#
# --
"""B3clf utility functions."""
import os
import numpy as np
import pandas as pd
from joblib import load
# Public API: the names exported by a star-import of this module.
__all__ = [
    "get_descriptors",
    "select_descriptors",
    "scale_descriptors",
    "get_clf",
    "predict_permeability",
]
def get_descriptors(df):
    """Split a descriptor table into a features dataframe and an information dataframe.

    Parameters
    ----------
    df : str, os.PathLike or pandas.DataFrame
        Either an already loaded table, or the path of a ``.sdf``, ``.xlsx``
        or ``.csv`` file (the extension is matched case-insensitively).

    Returns
    -------
    X : pandas.DataFrame
        Every column whose name is not one of the information column names.
    info : pandas.DataFrame
        The information columns present in the table; when none is present,
        a dataframe without columns that shares the index of ``X``.

    Raises
    ------
    ValueError
        If a path with an unsupported file extension is given.

    Notes
    -----
    Rows holding NaN or +/-infinity in *any* column (information columns
    included) are dropped. A dataframe passed in by the caller is never
    modified; the cleaning is done on new objects.
    """
    if isinstance(df, (str, os.PathLike)):
        path = os.fspath(df)
        lowered = path.lower()
        if lowered.endswith(".sdf"):
            # NOTE(review): pandas does not document a ``read_sdf`` reader, so
            # this branch presumably fails with AttributeError unless something
            # else provides it -- confirm and switch to a real SDF loader.
            df = pd.read_sdf(path)
        elif lowered.endswith(".xlsx"):
            df = pd.read_excel(path, engine="openpyxl")
        elif lowered.endswith(".csv"):
            df = pd.read_csv(path)
        else:
            raise ValueError(
                "Command-line tool only supports feature files in "
                ".SDF, .XLSX or .CSV format"
            )

    # NOTE(review): "compoud_name" looks like a typo of "compound_name", but it
    # is a column key that upstream data may rely on, so it is kept unchanged.
    info_list = ["compoud_name", "SMILES", "cid", "category", "inchi", "Energy"]

    # Drop infinity and NaN values without touching the caller's dataframe:
    # both calls return new objects instead of operating in place.
    cleaned = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

    columns = cleaned.columns.to_list()
    features_cols = [col for col in columns if col not in info_list]
    info_cols = [col for col in columns if col in info_list]

    # Explicit copies so that later column assignments on the returned frames
    # do not raise SettingWithCopyWarning.
    X = cleaned[features_cols].copy()
    if info_cols:
        info = cleaned[info_cols].copy()
    else:
        info = pd.DataFrame(index=cleaned.index)
    return X, info
def select_descriptors(df):
    """Keep only the descriptor columns listed in ``feature_list.txt``.

    The list is read from ``feature_list.txt`` located next to this module,
    one column name per line. Columns of ``df`` whose name appears in that
    list are returned in their original order; names from the list that are
    absent from ``df`` are silently skipped.
    """
    feature_file = os.path.join(os.path.dirname(__file__), "feature_list.txt")
    with open(feature_file) as handle:
        wanted = handle.read().splitlines()

    kept_columns = []
    for name in df.columns.to_list():
        if name in wanted:
            kept_columns.append(name)
    return df[kept_columns]
def scale_descriptors(df):
    """Scale input features with the pre-fitted B3DB standard scaler.

    The scaler is loaded from ``pre_trained/b3clf_scaler.joblib`` next to this
    module (it was fitted using the full B3DB dataset) and its ``transform``
    method is applied to ``df``. Whatever ``transform`` returns is handed back
    unchanged.
    """
    scaler_path = os.path.join(
        os.path.dirname(__file__), "pre_trained", "b3clf_scaler.joblib"
    )
    return load(scaler_path).transform(df)
def get_clf(clf_str, sampling_str):
    """Load the fitted B3clf classifier for a classifier/sampling combination.

    ``clf_str`` must be one of ``dtree``, ``knn``, ``logreg`` or ``xgb`` and
    ``sampling_str`` one of the supported resampling names listed below;
    otherwise a ``ValueError`` is raised (the classifier name is checked
    first). The model is read from
    ``pre_trained/b3clf_<clf_str>_<sampling_str>.joblib`` next to this module.
    """
    supported_classifiers = ("dtree", "knn", "logreg", "xgb")
    supported_samplings = (
        "borderline_SMOTE",
        "classic_ADASYN",
        "classic_RandUndersampling",
        "classic_SMOTE",
        "kmeans_SMOTE",
        "common",
    )

    # Input validation; this could live in a shared parameter-check helper.
    if clf_str not in supported_classifiers:
        raise ValueError("Input classifier is not supported; got {}".format(clf_str))
    if sampling_str not in supported_samplings:
        raise ValueError(
            "Input sampling method is not supported; got {}".format(sampling_str)
        )

    # Pre-trained models are shipped inside the package directory.
    model_name = "b3clf_{}_{}.joblib".format(clf_str, sampling_str)
    model_dir = os.path.join(os.path.dirname(__file__), "pre_trained")
    return load(os.path.join(model_dir, model_name))
def predict_permeability(
    clf_str, sampling_str, mol_features, info_df, threshold="none"
):
    """Compute BBB predicted probabilities and labels and return them as a dataframe.

    Parameters
    ----------
    clf_str : str
        Classifier name, forwarded to ``get_clf``.
    sampling_str : str
        Resampling strategy name, forwarded to ``get_clf``.
    mol_features : pandas.DataFrame or array-like
        Feature matrix handed to the classifier's ``predict_proba``. When it
        is a dataframe its index must match the index of ``info_df``.
    info_df : pandas.DataFrame
        Per-molecule information. It is not modified; the results are written
        to a copy.
    threshold : str, optional
        Column label of ``data/B3clf_thresholds.xlsx`` holding the probability
        cut-off. Presumably the default column ``"none"`` stores 0.5 -- confirm
        against the spreadsheet.

    Returns
    -------
    pandas.DataFrame
        Copy of ``info_df`` with the columns ``B3clf_predicted_probability``
        (second column of ``predict_proba``) and ``B3clf_predicted_label``
        (1 where probability >= threshold, else 0), and with its index reset
        so the former index becomes a regular column.

    Raises
    ------
    ValueError
        If the classifier/sampling names are unsupported (raised by
        ``get_clf``) or if the indices of ``mol_features`` and ``info_df``
        differ.
    """
    # load the threshold data
    dirname = os.path.dirname(__file__)
    fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx")
    df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl")

    # get the classifier
    clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)

    if isinstance(mol_features, pd.DataFrame):
        if mol_features.index.tolist() != info_df.index.tolist():
            raise ValueError(
                "Features_df and Info_df do not have the same index. Internal processing error"
            )

    # Work on a copy: writing into the caller's frame triggers
    # SettingWithCopyWarning when it is a slice, and the in-place index reset
    # would add another index column on every repeated call.
    results = info_df.copy()

    # get predicted probabilities (probability of the second class)
    probabilities = clf.predict_proba(mol_features)[:, 1]
    results["B3clf_predicted_probability"] = probabilities

    # NOTE(review): the cut-off is always read from the "xgb-classic_ADASYN"
    # row, whatever classifier/sampling pair was requested. The model-specific
    # lookup ``df_thres.loc[clf_str + "-" + sampling_str, threshold]`` was
    # commented out upstream -- confirm which one is intended.
    cutoff = df_thres.loc["xgb-classic_ADASYN", threshold]

    # get predicted label from probability using the threshold
    mask = np.greater_equal(
        results["B3clf_predicted_probability"].to_numpy(), cutoff
    )
    results["B3clf_predicted_label"] = mask.astype(int)

    return results.reset_index()