|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
Main B3clf Script. |
|
""" |
|
|
|
|
|
import os |
|
|
|
import numpy as np |
|
from .descriptor_padel import compute_descriptors |
|
from .geometry_opt import geometry_optimize |
|
from .utils import ( |
|
get_descriptors, |
|
predict_permeability, |
|
scale_descriptors, |
|
select_descriptors, |
|
) |
|
|
|
# Public API: the only name exported by a star-import of this module.
__all__ = ["b3clf"]
|
|
|
|
|
def b3clf(
    mol_in,
    sep=r"\s+|\t+",
    clf="xgb",
    sampling="classic_ADASYN",
    output="B3clf_output.xlsx",
    verbose=1,
    random_seed=42,
    time_per_mol=-1,
    keep_features="no",
    keep_sdf="no",
    threshold="none",
):
    r"""Use B3clf for BBB classifications with resampling strategies.

    The steps run in this order: geometry optimization of the input molecules into an
    intermediate SDF file, descriptor computation into an intermediate Excel file, descriptor
    selection, descriptor scaling, and permeability prediction. The predictions are written to
    `output` and also returned.

    Both intermediate files are named after the part of the base name of `mol_in` that precedes
    its first dot (``<tag>_optimized_3d.sdf`` and ``<tag>_padel_descriptors.xlsx``) and are
    written as relative paths. Unless the matching ``keep_*`` flag is "yes", they are deleted
    before the function exits, whether it returns normally or raises.

    Parameters
    ----------
    mol_in : str
        Input molecule text file which can be SMILES strings (file extension with .smi or .csv) or
        SDF file format. No space is allowed for molecular name if input is a file with SMILES
        strings.
    sep : str, optional
        Separator used to parse data if a text file with SMILES strings is provided.
        Default="\s+|\t+" which will take any space and any tab as delimiter.
    clf : str, optional
        Classification algorithm, which can be "dtree" for decision trees, "knn" for kNN, "logreg"
        for logistic regression and "xgb" for XGBoost. Default="xgb".
    sampling : str, optional
        Sampling strategies that can be used which includes "common",
        "RandUndersampling", "SMOTE", "borderline_SMOTE", "kmeans_SMOTE" and "classic_ADASYN". The
        "common" denotes that no resampling strategy is employed. Default="classic_ADASYN".
    output : str, optional
        Output file name for the predicted results consisting of molecule ID, predicted
        probability and labels for BBB permeability. Default="B3clf_output.xlsx".
    verbose : int, optional
        When verbose is zero, no results are printed out. Otherwise, the program prints the
        predictions. Default=1.
    random_seed : int, optional
        Random seed. Default=42. Kept for interface compatibility: it is not forwarded to any of
        the steps called by this function.
    time_per_mol : int, optional
        Time limit for each molecule in seconds, passed to the descriptor computation.
        Default=-1, which means no time limit.
    keep_features : str, optional
        To keep intermediate molecular feature file, "yes" or "no". Default="no".
    keep_sdf : str, optional
        To keep intermediate molecular geometry file with 3D coordinates, "yes" or "no".
        Default="no".
    threshold : str, optional
        To set the threshold for the predicted probability which can be "none", "J_threshold" and
        "F_threshold". "J_threshold" will use threshold optimized from Youden's J statistic.
        "F_threshold" will use threshold optimized from F score. Default="none".

    Returns
    -------
    result_df : pandas.DataFrame
        Result of BBB predictions with molecule ID/name, predicted probability and predicted
        labels. Only the columns "ID", "SMILES", "B3clf_predicted_probability" and
        "B3clf_predicted_label" are kept, in the order the prediction step produced them.

    """
    # Tag for the intermediate files: base name of the input up to its first dot.
    mol_tag = os.path.basename(mol_in).split(".")[0]
    features_out = f"{mol_tag}_padel_descriptors.xlsx"
    internal_sdf = f"{mol_tag}_optimized_3d.sdf"

    try:
        # Geometry optimization: writes the 3D structures to the intermediate SDF file.
        geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)

        # Descriptor computation: reads the SDF file and writes the descriptor spreadsheet.
        # The return value is not needed because the descriptors are re-read from disk below.
        compute_descriptors(
            sdf_file=internal_sdf,
            excel_out=features_out,
            output_csv=None,
            timeout=None,
            time_per_molecule=time_per_mol,
        )

        # Load the descriptors, then select and scale them for the classifier.
        X_features, info_df = get_descriptors(df=features_out)
        X_features = select_descriptors(df=X_features)
        X_features = scale_descriptors(df=X_features)

        # Predict BBB permeability.
        result_df = predict_permeability(
            clf_str=clf,
            sampling_str=sampling,
            mol_features=X_features,
            info_df=info_df,
            threshold=threshold,
        )

        # Keep only the reporting columns that are present, preserving their original order.
        display_cols = [
            "ID",
            "SMILES",
            "B3clf_predicted_probability",
            "B3clf_predicted_label",
        ]
        result_df = result_df[[col for col in result_df.columns if col in display_cols]]

        if verbose != 0:
            print(result_df)

        result_df.to_excel(output, index=False, engine="openpyxl")
    finally:
        # Remove the intermediates on failure as well as on success, so an aborted run does
        # not leave stray files behind.
        stale_files = []
        if keep_features != "yes":
            stale_files.append(features_out)
        if keep_sdf != "yes":
            stale_files.append(internal_sdf)
        for path in stale_files:
            try:
                os.remove(path)
            except FileNotFoundError:
                # The step that creates this file never ran or failed before writing it.
                pass

    return result_df
|
|