File size: 5,285 Bytes
d05f89f 0e95800 d05f89f 0e95800 d05f89f 0e95800 d05f89f 0e95800 d05f89f 0e95800 d05f89f 0e95800 d05f89f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# -*- coding: utf-8 -*-
# The B3clf library computes the blood-brain barrier (BBB) permeability
# of organic molecules with resampling strategies.
#
# Copyright (C) 2021 The Ayers Lab
#
# This file is part of B3clf.
#
# B3clf is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# B3clf is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
#
# --
"""B3clf utility functions."""
import os
import numpy as np
import pandas as pd
from joblib import load
# Public API: the names exported by a star import of this module.
__all__ = [
"get_descriptors",
"select_descriptors",
"scale_descriptors",
"get_clf",
"predict_permeability",
]
def get_descriptors(df):
    """Split a descriptor table into a features frame and an information frame.

    Parameters
    ----------
    df : pandas.DataFrame or str or os.PathLike
        Either an already-loaded table, or the path of a ``.sdf``, ``.xlsx``
        or ``.csv`` file to load (the extension is matched case-insensitively).

    Returns
    -------
    X : pandas.DataFrame
        Every column that is not an information column, restricted to the rows
        that contain no NaN or infinite value in any column.
    info : pandas.DataFrame
        The information columns that are present, over the same rows as ``X``.
        When none is present, an empty frame that shares the index of ``X``.

    Raises
    ------
    ValueError
        If a path is given whose extension is not one of the supported ones.

    Notes
    -----
    The input frame is not modified; cleaning is performed on a copy and both
    returned frames are independent copies.
    """
    if isinstance(df, (str, os.PathLike)):
        path = os.fspath(df)
        extension_probe = path.lower()
        if extension_probe.endswith(".sdf"):
            # NOTE(review): stock pandas does not document a ``read_sdf``
            # reader, so this branch looks like it raises AttributeError
            # unless something patches pandas -- confirm and switch to a real
            # SDF loader.
            df = pd.read_sdf(path)
        elif extension_probe.endswith(".xlsx"):
            df = pd.read_excel(path, engine="openpyxl")
        elif extension_probe.endswith(".csv"):
            df = pd.read_csv(path)
        else:
            raise ValueError(
                "Feature files are only supported in .sdf, .xlsx or .csv "
                "format; got {!r}".format(path)
            )

    # "compoud_name" is the historical (misspelled) label and is kept for
    # backward compatibility; the correctly spelled label is accepted too so
    # that it is not mistaken for a feature column.
    info_names = (
        "compoud_name",
        "compound_name",
        "SMILES",
        "cid",
        "category",
        "inchi",
        "Energy",
    )

    # Drop rows holding infinity or NaN. Non-inplace calls return new objects,
    # which leaves the caller's DataFrame untouched.
    cleaned = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

    feature_cols = [col for col in cleaned.columns if col not in info_names]
    info_cols = [col for col in cleaned.columns if col in info_names]

    # Explicit copies so that later column assignments on the results do not
    # emit SettingWithCopyWarning.
    X = cleaned[feature_cols].copy()
    if info_cols:
        info = cleaned[info_cols].copy()
    else:
        info = pd.DataFrame(index=cleaned.index)
    return X, info
def select_descriptors(df):
    """Keep only the descriptor columns that the B3clf models take as input.

    The accepted column names are read, one per line, from ``feature_list.txt``
    located in the same directory as this module. Columns of ``df`` whose name
    appears in that list are returned in their original order; all other
    columns are left out.
    """
    list_path = os.path.join(os.path.dirname(__file__), "feature_list.txt")
    with open(list_path) as handle:
        wanted = handle.read().splitlines()

    kept_columns = []
    for name in df.columns.to_list():
        if name in wanted:
            kept_columns.append(name)
    return df[kept_columns]
def scale_descriptors(df):
    """Scale the input features with the pre-fitted B3DB standard scaler.

    The scaler, which was fitted using the full B3DB dataset, is deserialized
    from ``pre_trained/b3clf_scaler.joblib`` beside this module and its
    ``transform`` method is applied to ``df``. The transformed data is
    returned as produced by the scaler.
    """
    scaler_path = os.path.join(
        os.path.dirname(__file__), "pre_trained", "b3clf_scaler.joblib"
    )
    return load(scaler_path).transform(df)
def get_clf(clf_str, sampling_str):
    """Load the fitted B3clf classifier for a classifier/resampling pair.

    Parameters
    ----------
    clf_str : str
        Classifier key: ``"dtree"``, ``"knn"``, ``"logreg"`` or ``"xgb"``.
    sampling_str : str
        Resampling key: ``"borderline_SMOTE"``, ``"classic_ADASYN"``,
        ``"classic_RandUndersampling"``, ``"classic_SMOTE"``,
        ``"kmeans_SMOTE"`` or ``"common"``.

    Returns
    -------
    object
        The object deserialized from
        ``pre_trained/b3clf_<clf_str>_<sampling_str>.joblib`` beside this
        module.

    Raises
    ------
    ValueError
        If ``clf_str`` is not supported, or otherwise if ``sampling_str`` is
        not supported (the classifier key is checked first).
    """
    supported_classifiers = ("dtree", "knn", "logreg", "xgb")
    supported_samplings = (
        "borderline_SMOTE",
        "classic_ADASYN",
        "classic_RandUndersampling",
        "classic_SMOTE",
        "kmeans_SMOTE",
        "common",
    )

    # Guard clauses: reject unknown keys before touching the file system.
    if clf_str not in supported_classifiers:
        raise ValueError("Input classifier is not supported; got {}".format(clf_str))
    if sampling_str not in supported_samplings:
        raise ValueError(
            "Input sampling method is not supported; got {}".format(sampling_str)
        )

    model_file = "b3clf_{}_{}.joblib".format(clf_str, sampling_str)
    model_path = os.path.join(os.path.dirname(__file__), "pre_trained", model_file)
    return load(model_path)
def predict_permeability(
    clf_str, sampling_str, mol_features, info_df, threshold="none"
):
    """Compute BBB predicted probabilities and labels and store them in ``info_df``.

    Parameters
    ----------
    clf_str : str
        Classifier key, forwarded to ``get_clf``.
    sampling_str : str
        Resampling key, forwarded to ``get_clf``.
    mol_features : pandas.DataFrame or array-like
        Feature matrix handed to the classifier's ``predict_proba``. When it
        is a DataFrame its index must equal the index of ``info_df``.
    info_df : pandas.DataFrame
        Frame that receives the results. It is modified in place.
    threshold : str, optional
        Column label looked up in ``data/B3clf_thresholds.xlsx`` to obtain the
        probability cut-off. NOTE(review): the original comment says the
        default corresponds to a cut-off of 0.5 -- presumably the ``"none"``
        column holds 0.5; confirm against the data file.

    Returns
    -------
    pandas.DataFrame
        ``info_df`` itself, with the columns ``B3clf_predicted_probability``
        (second column of ``predict_proba``) and ``B3clf_predicted_label``
        (1 where the probability is >= the cut-off, else 0) added, and with
        its former index moved into a column by ``reset_index``.

    Raises
    ------
    ValueError
        If ``mol_features`` is a DataFrame whose index differs from that of
        ``info_df``; also propagated from ``get_clf`` for unsupported keys.
    """
    # Load the table of decision thresholds shipped with the package.
    module_dir = os.path.dirname(__file__)
    thresholds_path = os.path.join(module_dir, "data", "B3clf_thresholds.xlsx")
    thresholds = pd.read_excel(thresholds_path, index_col=0, engine="openpyxl")

    clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)

    # isinstance (rather than an exact type comparison) so that DataFrame
    # subclasses also get the index consistency check.
    if isinstance(mol_features, pd.DataFrame):
        if mol_features.index.tolist() != info_df.index.tolist():
            raise ValueError(
                "Features_df and Info_df do not have the same index. Internal processing error"
            )

    # Probability of the class stored in the second column of predict_proba.
    probabilities = clf.predict_proba(mol_features)[:, 1]
    info_df["B3clf_predicted_probability"] = probabilities

    # NOTE(review): the threshold row is fixed to "xgb-classic_ADASYN" for
    # every classifier/sampling combination. A previously commented-out line
    # used the row ``clf_str + "-" + sampling_str`` instead; confirm which one
    # is intended (and that the threshold file has a row per model) before
    # changing it, since doing so alters the predicted labels.
    cutoff = thresholds.loc["xgb-classic_ADASYN", threshold]

    # Label is 1 where probability >= cut-off and 0 elsewhere.
    above_cutoff = np.greater_equal(
        info_df["B3clf_predicted_probability"].to_numpy(), cutoff
    )
    info_df["B3clf_predicted_label"] = above_cutoff.astype(int)

    info_df.reset_index(inplace=True)
    return info_df
|