File size: 5,469 Bytes
d05f89f 0e95800 d05f89f 0e95800 d05f89f 0e95800 d05f89f 0e95800 d05f89f 0e95800 d05f89f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
# -*- coding: utf-8 -*-
# The B3clf library computes the blood-brain barrier (BBB) permeability
# of organic molecules with resampling strategies.
#
# Copyright (C) 2021 The Ayers Lab
#
# This file is part of B3clf.
#
# B3clf is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# B3clf is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
#
# --
"""
Main B3clf Script.
"""
# TODO: Enable b3clf prediction directly from precomputed PaDEL descriptor input, skipping the PaDEL calculation
import os
import numpy as np
from .descriptor_padel import compute_descriptors
from .geometry_opt import geometry_optimize
from .utils import (
get_descriptors,
predict_permeability,
scale_descriptors,
select_descriptors,
)
# Public API of this module: only the top-level ``b3clf`` function is exported
# by ``from <module> import *``.
__all__ = [
"b3clf",
]
def b3clf(
    mol_in,
    # The backslash before "s" is escaped explicitly: a bare "\s" in a normal
    # string literal is an invalid escape sequence. The value is unchanged.
    sep="\\s+|\t+",
    clf="xgb",
    sampling="classic_ADASYN",
    output="B3clf_output.xlsx",
    verbose=1,
    random_seed=42,
    time_per_mol=-1,
    keep_features="no",
    keep_sdf="no",
    threshold="none",
):
    r"""Use B3clf for BBB classifications with resampling strategies.

    The function runs the whole pipeline: geometry optimization of the input
    molecules, PaDEL descriptor calculation, descriptor selection and scaling,
    prediction, and export of the predictions to an Excel file.

    Parameters
    ----------
    mol_in : str
        Input molecule text file which can be SMILES strings (file extension with .smi or .csv)
        or SDF file format. No space is allowed for molecular name if input is a file with
        SMILES strings.
    sep : str, optional
        Separator used to parse data if a text file with SMILES strings is provided.
        Default="\s+|\t+" which will take any space and any tab as delimiter.
    clf : str, optional
        Classification algorithm, which can be "dtree" for decision trees, "knn" for kNN,
        "logreg" for logistical regression and "xgb" for XGBoost. Default="xgb".
    sampling : str, optional
        Sampling strategies that can be used which includes "common",
        "RandUndersampling", "SMOTE", "borderline_SMOTE", "kmeans_SMOTE" and "classic_ADASYN".
        The "common" denotes that no resampling strategy is employed.
        Default="classic_ADASYN".
    output : str, optional
        Output file name for the predicted results consisting molecule ID, predicted
        probability and labels for BBB permeability. Default="B3clf_output.xlsx".
    verbose : int, optional
        When verbose is zero, no results are printed out. Otherwise, the program prints the
        predictions. Default=1.
    random_seed : int, optional
        Random seed for reproducibility. Default=42. It is accepted for backward
        compatibility but is not forwarded to any step of this function.
    time_per_mol : int, optional
        Time limit for each molecule in seconds, forwarded to the descriptor calculation.
        Default=-1, which means no time limit.
    keep_features : str, optional
        To keep intermediate molecular feature file, "yes" or "no". Default="no".
    keep_sdf : str, optional
        To keep intermediate molecular geometry file with 3D coordinates, "yes" or "no".
        Default="no".
    threshold : str, optional
        To set the threshold for the predicted probability which can be "none", "J_threshold"
        and "F_threshold". "J_threshold" will use threshold optimized from Youden’s J
        statistic. "F_threshold" will use threshold optimized from F score. Default="none".

    Returns
    -------
    result_df : pandas.DataFrame
        Result of BBB predictions with molecule ID/name, predicted probability and predicted
        labels.

    Notes
    -----
    Two intermediate files, named after the input file, are written to the current working
    directory: ``<tag>_optimized_3d.sdf`` and ``<tag>_padel_descriptors.xlsx``. Unless the
    corresponding ``keep_*`` flag is "yes", they are removed when the function exits, whether
    it succeeds or raises.
    """
    # Intermediate file names are derived from the input file name up to its first dot.
    mol_tag = os.path.basename(mol_in).split(".")[0]
    features_out = f"{mol_tag}_padel_descriptors.xlsx"
    internal_sdf = f"{mol_tag}_optimized_3d.sdf"

    try:
        # Geometry optimization.
        # Input: either an SDF file with molecular geometries or a text file with
        # SMILES strings. Output: an SDF file with 3D coordinates.
        geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)

        # Compute descriptors; the results are read back from ``features_out`` below,
        # so the return value is not needed here.
        compute_descriptors(
            sdf_file=internal_sdf,
            excel_out=features_out,
            output_csv=None,
            timeout=None,
            time_per_molecule=time_per_mol,
        )

        # Load the computed descriptors, then select and scale them.
        X_features, info_df = get_descriptors(df=features_out)
        X_features = select_descriptors(df=X_features)
        X_features = scale_descriptors(df=X_features)

        # Predict permeability with the requested classifier/resampling combination.
        result_df = predict_permeability(
            clf_str=clf,
            sampling_str=sampling,
            mol_features=X_features,
            info_df=info_df,
            threshold=threshold,
        )

        # Keep only the columns meant for display, preserving their existing order.
        display_cols = [
            "ID",
            "SMILES",
            "B3clf_predicted_probability",
            "B3clf_predicted_label",
        ]
        result_df = result_df[[col for col in result_df.columns if col in display_cols]]

        if verbose != 0:
            print(result_df)

        result_df.to_excel(output, index=False, engine="openpyxl")
    finally:
        # Clean up intermediate files even when a step above failed, so that aborted
        # runs do not leave stray files behind.
        for keep_flag, path in (
            (keep_features, features_out),
            (keep_sdf, internal_sdf),
        ):
            if keep_flag != "yes":
                try:
                    os.remove(path)
                except FileNotFoundError:
                    # The file was never created because an earlier step failed.
                    pass

    return result_df
|