|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""B3clf utility functions.""" |
|
|
|
import os |
|
|
|
import numpy as np |
|
import pandas as pd |
|
from joblib import load |
|
|
|
# Names exported by a star-import of this module.
__all__ = [
    "get_descriptors", "select_descriptors", "scale_descriptors",
    "get_clf", "predict_permeability",
]
|
|
|
|
|
def get_descriptors(df):
    """Split molecular data into a descriptor table and an information table.

    Parameters
    ----------
    df : str, os.PathLike or pandas.DataFrame
        Either an already loaded table, or the path of a ``.sdf``, ``.xlsx``
        or ``.csv`` file to load. The extension is matched case-insensitively.

    Returns
    -------
    X : pandas.DataFrame
        Every column that is not one of the known information columns.
    info : pandas.DataFrame
        The information columns that are present in the input. When none is
        present, an empty frame that shares the index of ``X``.

    Raises
    ------
    ValueError
        If a path is given whose extension is not one of the supported ones.

    Notes
    -----
    Rows holding NaN or +/-inf in *any* column (information columns included)
    are dropped. A DataFrame passed in by the caller is not modified; the
    cleaning is performed on a copy.
    """
    if isinstance(df, (str, os.PathLike)):
        path = os.fspath(df)
        lowered = path.lower()
        if lowered.endswith(".sdf"):
            # NOTE(review): pandas itself does not provide ``read_sdf``; unless
            # something registers it elsewhere this branch raises
            # AttributeError. Kept as-is because the proper fix needs a
            # dependency that is not visible from this file.
            df = pd.read_sdf(path)
        elif lowered.endswith(".xlsx"):
            df = pd.read_excel(path, engine="openpyxl")
        elif lowered.endswith(".csv"):
            df = pd.read_csv(path)
        else:
            raise ValueError(
                "Unsupported feature file {!r}: expected a .sdf, .xlsx or .csv "
                "extension".format(path)
            )

    # Column names treated as molecule metadata rather than descriptors.
    # NOTE(review): "compoud_name" looks like a misspelling, but it is matched
    # against column names at runtime, so it is left untouched here.
    info_names = {"compoud_name", "SMILES", "cid", "category", "inchi", "Energy"}

    # Turn infinities into NaN, then drop every incomplete row. Done without
    # ``inplace=True`` so that the caller's frame is left as it was.
    df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

    columns = df.columns.to_list()
    feature_cols = [col for col in columns if col not in info_names]
    info_cols = [col for col in columns if col in info_names]

    X = df[feature_cols]
    if info_cols:
        info = df[info_cols]
    else:
        # No metadata available: keep an empty frame aligned on the same index.
        info = pd.DataFrame(index=df.index)

    return X, info
|
|
|
|
|
def select_descriptors(df):
    """Keep only the descriptor columns that the B3clf models take as input.

    The accepted descriptor names are read, one per line, from
    ``feature_list.txt`` located next to this module.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are descriptor names.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` whose name appears in the feature list, in the
        order they have in ``df``. Listed names that are missing from ``df``
        are skipped without error.
    """
    list_path = os.path.join(os.path.dirname(__file__), "feature_list.txt")
    # Explicit encoding: the packaged list must read the same on every locale.
    with open(list_path, encoding="utf-8") as handle:
        # A set gives constant-time membership tests in the filter below.
        wanted = set(handle.read().splitlines())

    kept_columns = [col for col in df.columns.to_list() if col in wanted]
    return df[kept_columns]
|
|
|
|
|
def scale_descriptors(df):
    """Standardize descriptors with the pre-fitted B3DB scaler.

    The scaler (fitted on the full B3DB dataset) is loaded from
    ``pre_trained/b3clf_scaler.joblib`` next to this module, and the output of
    its ``transform`` method applied to ``df`` is returned as-is.
    """
    scaler_path = os.path.join(
        os.path.dirname(__file__), "pre_trained", "b3clf_scaler.joblib"
    )
    return load(scaler_path).transform(df)
|
|
|
|
|
def get_clf(clf_str, sampling_str):
    """Load the pre-trained B3clf classifier for a model/sampling pair.

    Parameters
    ----------
    clf_str : str
        Classifier key; one of ``dtree``, ``knn``, ``logreg`` or ``xgb``.
    sampling_str : str
        Sampling-strategy key; one of ``borderline_SMOTE``,
        ``classic_ADASYN``, ``classic_RandUndersampling``, ``classic_SMOTE``,
        ``kmeans_SMOTE`` or ``common``.

    Returns
    -------
    object
        Whatever is stored in
        ``pre_trained/b3clf_<clf_str>_<sampling_str>.joblib`` next to this
        module.

    Raises
    ------
    ValueError
        If either key is not in the supported set. The classifier key is
        checked first.
    """
    supported_clfs = ("dtree", "knn", "logreg", "xgb")
    supported_samplings = (
        "borderline_SMOTE",
        "classic_ADASYN",
        "classic_RandUndersampling",
        "classic_SMOTE",
        "kmeans_SMOTE",
        "common",
    )

    # Guard clauses: reject unknown keys before touching the file system.
    if clf_str not in supported_clfs:
        raise ValueError("Input classifier is not supported; got {}".format(clf_str))
    if sampling_str not in supported_samplings:
        raise ValueError(
            "Input sampling method is not supported; got {}".format(sampling_str)
        )

    model_file = "b3clf_{}_{}.joblib".format(clf_str, sampling_str)
    model_path = os.path.join(os.path.dirname(__file__), "pre_trained", model_file)
    return load(model_path)
|
|
|
|
|
def predict_permeability(
    clf_str, sampling_str, mol_features, info_df, threshold="none"
):
    """Predict BBB permeability and store the results in ``info_df``.

    Parameters
    ----------
    clf_str : str
        Classifier key forwarded to ``get_clf``.
    sampling_str : str
        Sampling-strategy key forwarded to ``get_clf``.
    mol_features : pandas.DataFrame or array-like
        Feature matrix handed to the classifier's ``predict_proba``. Only its
        ``shape[0]`` is used otherwise.
    info_df : pandas.DataFrame
        Frame that receives the prediction columns. It is modified in place.
    threshold : str, optional
        Column label looked up in ``data/B3clf_thresholds.xlsx`` to obtain the
        probability cut-off. Defaults to ``"none"``.

    Returns
    -------
    pandas.DataFrame
        ``info_df`` itself, with the added columns
        ``B3clf_predicted_probability`` and ``B3clf_predicted_label`` and with
        its index reset (the previous index becomes a regular column).

    Raises
    ------
    ValueError
        If ``mol_features`` is a DataFrame whose index differs from that of
        ``info_df``, or if ``get_clf`` rejects the given keys.
    """
    # Decision thresholds shipped with the package, indexed by first column.
    dirname = os.path.dirname(__file__)
    fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx")
    df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl")

    # Every molecule starts with label 0; rows passing the cut-off become 1.
    label_pool = np.zeros(mol_features.shape[0], dtype=int)

    clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)

    # ``isinstance`` (rather than an exact type comparison) so that DataFrame
    # subclasses also get the index consistency check.
    if isinstance(mol_features, pd.DataFrame):
        if mol_features.index.tolist() != info_df.index.tolist():
            raise ValueError(
                "Features_df and Info_df do not have the same index. Internal processing error"
            )

    # Second column of ``predict_proba``.
    # NOTE(review): presumably the probability of the permeable (BBB+) class;
    # confirm against how the models were trained.
    info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(mol_features)[
        :, 1
    ]

    # NOTE(review): the threshold row is hard-coded to "xgb-classic_ADASYN"
    # regardless of ``clf_str``/``sampling_str``. Left unchanged because the
    # rows available in the spreadsheet cannot be seen from here; verify
    # whether a per-model lookup is intended.
    mask = np.greater_equal(
        info_df["B3clf_predicted_probability"].to_numpy(),
        df_thres.loc["xgb-classic_ADASYN", threshold],
    )
    label_pool[mask] = 1

    info_df["B3clf_predicted_label"] = label_pool

    # Move the index into a regular column of the returned frame.
    info_df.reset_index(inplace=True)

    return info_df
|
|