File size: 5,285 Bytes
d05f89f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e95800
d05f89f
0e95800
d05f89f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e95800
 
 
d05f89f
 
 
 
 
 
 
0e95800
d05f89f
 
 
 
0e95800
 
 
 
 
d05f89f
 
0e95800
 
 
d05f89f
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# -*- coding: utf-8 -*-
# The B3clf library computes the blood-brain barrier (BBB) permeability
# of organic molecules with resampling strategies.
#
# Copyright (C) 2021 The Ayers Lab
#
# This file is part of B3clf.
#
# B3clf is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# B3clf is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
#
# --

"""B3clf utility functions."""

import os

import numpy as np
import pandas as pd
from joblib import load

# Public API of this module: the names exported by a star-import.
__all__ = [
    "get_descriptors",
    "select_descriptors",
    "scale_descriptors",
    "get_clf",
    "predict_permeability",
]


def get_descriptors(df):
    """Split molecular data into a descriptor table and an information table.

    Parameters
    ----------
    df : pandas.DataFrame or str or os.PathLike
        Either a dataframe, or the path of a ``.sdf``, ``.xlsx`` or ``.csv``
        file to read the dataframe from (the extension check is
        case-insensitive).

    Returns
    -------
    X : pandas.DataFrame
        Every column whose label is not one of the information labels, in the
        original column order.
    info : pandas.DataFrame
        The information columns that are present. When none is present, an
        empty dataframe sharing the index of ``X``.

    Raises
    ------
    ValueError
        If a path is given whose extension is not one of the supported ones.

    Notes
    -----
    Rows holding any NaN or infinite value (in any column) are dropped. The
    input dataframe is left untouched; cleaning is done on a new object.
    """
    if isinstance(df, (str, os.PathLike)):
        path = os.fspath(df)
        lowered = path.lower()
        if lowered.endswith(".sdf"):
            # NOTE(review): pandas does not document a ``read_sdf`` reader, so
            # this call is expected to raise AttributeError unless something
            # registers one -- confirm how SDF input is meant to be parsed.
            df = pd.read_sdf(path)
        elif lowered.endswith(".xlsx"):
            df = pd.read_excel(path, engine="openpyxl")
        elif lowered.endswith(".csv"):
            df = pd.read_csv(path)
        else:
            raise ValueError(
                "Unsupported feature file format; expected a .sdf, .xlsx or "
                ".csv file, got {}".format(path)
            )

    # NOTE(review): "compoud_name" looks like a misspelling of "compound_name".
    # It is kept as-is because it has to match the column label produced
    # upstream -- confirm before renaming.
    info_list = ["compoud_name", "SMILES", "cid", "category", "inchi", "Energy"]

    # Drop rows containing infinity or NaN. This is done out of place so the
    # caller's dataframe is not silently modified.
    cleaned = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

    columns = cleaned.columns.to_list()
    features_cols = [col for col in columns if col not in info_list]
    info_cols = [col for col in columns if col in info_list]

    X = cleaned[features_cols]
    if info_cols:
        # Explicit copy: predict_permeability later assigns new columns on this
        # frame, which would otherwise be flagged as writing to a slice.
        info = cleaned[info_cols].copy()
    else:
        info = pd.DataFrame(index=cleaned.index)

    return X, info


def select_descriptors(df):
    """Keep only the descriptor columns named in ``feature_list.txt``.

    The list is read from ``feature_list.txt`` located next to this module,
    one column label per line; these are the Padel descriptors taken by the
    B3clf models. Columns of ``df`` not named there are discarded, and the
    surviving columns keep the order they have in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of computed descriptors.

    Returns
    -------
    pandas.DataFrame
        The sub-table restricted to the listed descriptors.
    """
    feature_file = os.path.join(os.path.dirname(__file__), "feature_list.txt")
    with open(feature_file) as handle:
        wanted = frozenset(handle.read().splitlines())

    kept_columns = []
    for label in df.columns.to_list():
        if label in wanted:
            kept_columns.append(label)

    return df[kept_columns]


def scale_descriptors(df):
    """Apply the pre-fitted B3DB standard scaler to the input features.

    The scaler is loaded from ``pre_trained/b3clf_scaler.joblib`` next to this
    module; it was fitted using the full B3DB dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Descriptor table to scale.

    Returns
    -------
    The result of the scaler's ``transform`` call on ``df``.
    """
    scaler_path = os.path.join(
        os.path.dirname(__file__), "pre_trained", "b3clf_scaler.joblib"
    )
    return load(scaler_path).transform(df)


def get_clf(clf_str, sampling_str):
    """Load the fitted B3clf classifier for a classifier/sampling pair.

    Parameters
    ----------
    clf_str : str
        Classifier identifier; one of ``"dtree"``, ``"knn"``, ``"logreg"``
        or ``"xgb"``.
    sampling_str : str
        Sampling identifier; one of ``"borderline_SMOTE"``,
        ``"classic_ADASYN"``, ``"classic_RandUndersampling"``,
        ``"classic_SMOTE"``, ``"kmeans_SMOTE"`` or ``"common"``.

    Returns
    -------
    The object stored in
    ``pre_trained/b3clf_<clf_str>_<sampling_str>.joblib`` next to this module.

    Raises
    ------
    ValueError
        If either identifier is not supported. The classifier is checked
        first.
    """
    supported_classifiers = ("dtree", "knn", "logreg", "xgb")
    supported_samplings = (
        "borderline_SMOTE",
        "classic_ADASYN",
        "classic_RandUndersampling",
        "classic_SMOTE",
        "kmeans_SMOTE",
        "common",
    )

    # Guard clauses; these could live in a shared input-validation helper.
    if clf_str not in supported_classifiers:
        raise ValueError(f"Input classifier is not supported; got {clf_str}")
    if sampling_str not in supported_samplings:
        raise ValueError(
            f"Input sampling method is not supported; got {sampling_str}"
        )

    # Pre-trained models are shipped inside the package directory.
    model_file = f"b3clf_{clf_str}_{sampling_str}.joblib"
    model_path = os.path.join(os.path.dirname(__file__), "pre_trained", model_file)
    return load(model_path)


def predict_permeability(
    clf_str, sampling_str, mol_features, info_df, threshold="none"
):
    """Predict BBB permeability and store the results in ``info_df``.

    Parameters
    ----------
    clf_str : str
        Classifier identifier forwarded to ``get_clf``.
    sampling_str : str
        Sampling identifier forwarded to ``get_clf``.
    mol_features : pandas.DataFrame or array-like with ``shape``
        Features passed to the classifier's ``predict_proba``.
    info_df : pandas.DataFrame
        Information table; it is modified in place.
    threshold : str, optional
        Column label looked up in ``data/B3clf_thresholds.xlsx`` to obtain the
        probability cutoff.

    Returns
    -------
    pandas.DataFrame
        The same ``info_df`` object, with the added columns
        ``B3clf_predicted_probability`` and ``B3clf_predicted_label`` and with
        its index reset (the previous index becomes a regular column).

    Raises
    ------
    ValueError
        If ``mol_features`` is a dataframe whose index differs from that of
        ``info_df``, or if ``get_clf`` rejects the identifiers.
    """
    # Threshold table shipped with the package, indexed by its first column.
    thresholds_path = os.path.join(
        os.path.dirname(__file__), "data", "B3clf_thresholds.xlsx"
    )
    thresholds = pd.read_excel(thresholds_path, index_col=0, engine="openpyxl")
    n_samples = mol_features.shape[0]

    model = get_clf(clf_str=clf_str, sampling_str=sampling_str)

    # Both tables must describe the same molecules in the same order.
    if type(mol_features) is pd.DataFrame and (
        mol_features.index.tolist() != info_df.index.tolist()
    ):
        raise ValueError(
            "Features_df and Info_df do not have the same index. Internal processing error"
        )

    # Probability of the class in column 1 of predict_proba's output.
    positive_proba = model.predict_proba(mol_features)[:, 1]
    info_df.loc[:, "B3clf_predicted_probability"] = positive_proba
    stored_proba = info_df["B3clf_predicted_probability"].to_numpy()

    # NOTE(review): the cutoff is always read from the "xgb-classic_ADASYN"
    # row, whatever clf_str/sampling_str are. The original code carried a
    # commented-out lookup keyed on clf_str + "-" + sampling_str -- confirm
    # which one is intended.
    cutoff = thresholds.loc["xgb-classic_ADASYN", threshold]

    # Labels start at 0 and become 1 where the probability reaches the cutoff.
    labels = np.zeros(n_samples, dtype=int)
    labels[np.greater_equal(stored_proba, cutoff)] = 1
    info_df["B3clf_predicted_label"] = labels

    info_df.reset_index(inplace=True)

    return info_df