File size: 5,469 Bytes
d05f89f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e95800
 
 
 
 
 
d05f89f
 
 
 
 
 
0e95800
 
 
 
 
 
 
 
 
 
 
 
 
d05f89f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e95800
 
 
 
 
 
 
d05f89f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e95800
 
 
 
 
 
 
d05f89f
 
0e95800
 
 
 
 
 
 
 
 
 
d05f89f
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# -*- coding: utf-8 -*-
# The B3clf library computes the blood-brain barrier (BBB) permeability
# of organic molecules with resampling strategies.
#
# Copyright (C) 2021 The Ayers Lab
#
# This file is part of B3clf.
#
# B3clf is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# B3clf is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
#
# --

"""
Main B3clf Script.
"""

# TODO: Enable b3clf prediction directly from precomputed PaDEL descriptor input, skipping the PaDEL calculation.
import os

import numpy as np
from .descriptor_padel import compute_descriptors
from .geometry_opt import geometry_optimize
from .utils import (
    get_descriptors,
    predict_permeability,
    scale_descriptors,
    select_descriptors,
)

# Public API of this module: only the ``b3clf`` entry point is exported.
__all__ = [
    "b3clf",
]


def b3clf(
    mol_in,
    sep=r"\s+|\t+",
    clf="xgb",
    sampling="classic_ADASYN",
    output="B3clf_output.xlsx",
    verbose=1,
    random_seed=42,
    time_per_mol=-1,
    keep_features="no",
    keep_sdf="no",
    threshold="none",
):
    r"""Use B3clf for BBB classifications with resampling strategies.

    The function runs the whole pipeline: geometry optimization of the input
    molecules, descriptor calculation, descriptor selection and scaling,
    permeability prediction, and export of the predictions to an Excel file.

    Parameters
    ----------
    mol_in : str
        Input molecule text file which can be SMILES strings (file extension with .smi or .csv)
        or SDF file format. No space is allowed for molecular name if input is a file with
        SMILES strings.
    sep : str, optional
        Separator used to parse data if a text file with SMILES strings is provided.
        Default=r"\s+|\t+" which will take any space and any tab as delimiter.
    clf : str, optional
        Classification algorithm, which can be "dtree" for decision trees, "knn" for kNN,
        "logreg" for logistic regression and "xgb" for XGBoost. Default="xgb".
    sampling : str, optional
        Sampling strategies that can be used which includes "common",
        "RandUndersampling", "SMOTE", "borderline_SMOTE", "kmeans_SMOTE" and "classic_ADASYN".
        The "common" denotes that no resampling strategy is employed. Default="classic_ADASYN".
    output : str, optional
        Output file name for the predicted results consisting of molecule ID, predicted
        probability and labels for BBB permeability. Default="B3clf_output.xlsx".
    verbose : int, optional
        When verbose is zero, no results are printed out. Otherwise, the program prints the
        predictions. Default=1.
    random_seed : int, optional
        Random seed for reproducibility. Default=42. It is accepted for interface
        compatibility but is not forwarded to any step of this function, so it currently
        has no effect on the result.
    time_per_mol : int, optional
        Time limit for each molecule in seconds. Default=-1, which means no time limit.
    keep_features : str, optional
        To keep intermediate molecular feature file, "yes" or "no". Default="no".
    keep_sdf : str, optional
        To keep intermediate molecular geometry file with 3D coordinates, "yes" or "no".
        Default="no".
    threshold : str, optional
        Threshold for the predicted probability, which can be "none", "J_threshold" or
        "F_threshold". "J_threshold" will use the threshold optimized from Youden's J
        statistic. "F_threshold" will use the threshold optimized from F score.
        Default="none".

    Returns
    -------
    result_df : pandas.DataFrame
        Result of BBB predictions with molecule ID/name, predicted probability and predicted labels.

    Notes
    -----
    Two intermediate files, ``<tag>_padel_descriptors.xlsx`` and ``<tag>_optimized_3d.sdf``,
    are written to the current working directory, where ``<tag>`` is the base name of
    `mol_in` up to its first dot. Unless the matching ``keep_*`` option is "yes", they are
    removed when the function exits, including when one of the steps raises.

    """
    # NOTE(review): `random_seed` is intentionally left unused here; the original code
    # only created a generator that was never passed to any step.

    # Everything before the first "." of the file name tags the intermediate files.
    mol_tag = os.path.basename(mol_in).split(".")[0]

    features_out = f"{mol_tag}_padel_descriptors.xlsx"
    internal_sdf = f"{mol_tag}_optimized_3d.sdf"

    # Columns reported to the user; anything else in the prediction table is dropped.
    display_cols = (
        "ID",
        "SMILES",
        "B3clf_predicted_probability",
        "B3clf_predicted_label",
    )

    try:
        # Geometry optimization
        # Input:
        # * Either an SDF file with molecular geometries or a text file with SMILES strings
        geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)

        # Compute descriptors; the return value is not needed because the
        # descriptors are read back from the Excel file below.
        compute_descriptors(
            sdf_file=internal_sdf,
            excel_out=features_out,
            output_csv=None,
            timeout=None,
            time_per_molecule=time_per_mol,
        )

        # Get computed descriptors
        X_features, info_df = get_descriptors(df=features_out)

        # Select descriptors
        X_features = select_descriptors(df=X_features)

        # Scale descriptors
        X_features = scale_descriptors(df=X_features)

        # Predict permeability with the requested classifier / resampling strategy
        result_df = predict_permeability(
            clf_str=clf,
            sampling_str=sampling,
            mol_features=X_features,
            info_df=info_df,
            threshold=threshold,
        )

        # Keep only the display columns, preserving the order of the prediction table
        result_df = result_df[[col for col in result_df.columns if col in display_cols]]
        if verbose != 0:
            print(result_df)

        result_df.to_excel(output, index=False, engine="openpyxl")
    finally:
        # Clean up intermediates even when a step above failed, so aborted runs
        # do not leave stray files behind.
        for path, keep in ((features_out, keep_features), (internal_sdf, keep_sdf)):
            if keep != "yes":
                try:
                    os.remove(path)
                except FileNotFoundError:
                    # The file was never created (an earlier step failed); nothing to
                    # remove, and raising here would mask the original error.
                    pass

    return result_df