File size: 9,134 Bytes
52b8e70
 
 
 
 
 
 
5bd9791
e1092cb
52b8e70
 
 
5bd9791
 
 
 
 
 
52b8e70
 
e1092cb
 
ed190ed
e1092cb
 
 
 
 
 
 
 
52b8e70
 
e1092cb
 
52b8e70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5bd9791
 
 
 
 
 
 
 
 
 
 
 
 
 
52b8e70
 
 
5bd9791
52b8e70
 
 
 
 
 
 
 
 
 
5bd9791
52b8e70
 
 
 
 
 
5bd9791
52b8e70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed190ed
 
52b8e70
 
 
ed190ed
 
5bd9791
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52b8e70
ed190ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5bd9791
ed190ed
 
 
 
 
5bd9791
ed190ed
 
 
 
 
52b8e70
 
 
 
 
 
 
 
e1092cb
52b8e70
5bd9791
 
 
 
 
 
 
 
52b8e70
 
 
 
ed190ed
 
 
 
 
 
 
 
52b8e70
 
 
ed190ed
52b8e70
 
ed190ed
52b8e70
ed190ed
52b8e70
 
e1092cb
 
 
52b8e70
ed190ed
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import os
import tempfile
from io import StringIO

import joblib
import numpy as np
import pandas as pd

# page set up
import streamlit as st
from b3clf.descriptor_padel import compute_descriptors
from b3clf.geometry_opt import geometry_optimize
from b3clf.utils import (
    get_descriptors,
    predict_permeability,
    scale_descriptors,
    select_descriptors,
)
from streamlit_ketcher import st_ketcher

# Page set-up: browser tab title and a full-width layout.
st.set_page_config(
    layout="wide",
    page_title="BBB Permeability Prediction with Imbalanced Learning",
)

# Deserialize the pre-trained estimator and the feature scaler, in that order.
# NOTE(review): an earlier prototype of `generate_predictions` called
# `scaler.transform(...)` and `model.predict_proba(...)` directly; the current
# implementation in this file does not reference `model` or `scaler`, so these
# loads look like leftovers -- confirm before removing them.
model, scaler = [
    joblib.load(artifact_path)
    for artifact_path in (
        "pre_trained/b3clf_knn_kmeans_SMOTE.joblib",
        "pre_trained/b3clf_scaler.joblib",
    )
]

# Flags kept from the original script; nothing else in this file reads them.
keep_features = keep_sdf = "no"

# Label shown in the "Classification algorithm" selectbox -> classifier
# identifier handed to `generate_predictions(clf=...)`.
# NOTE: "logsistical regression" is a misspelling of "logistic regression",
# but the key is looked up verbatim from the selectbox option with the same
# spelling, so both must be changed together.
classifiers_dict = {
    "decision trees": "dtree",
    "kNN": "knn",
    "logsistical regression": "logreg",
    "XGBoost": "xgb",
}

# Label shown in the "Resampling method" selectbox -> resampling identifier
# handed to `generate_predictions(sampling=...)`.
resample_methods_dict = {
    "random undersampling": "classic_RandUndersampling",
    "SMOTE": "classic_SMOTE",
    "Borderline SMOTE": "borderline_SMOTE",
    "k-means SMOTE": "kmeans_SMOTE",
    "ADASYN": "classic_ADASYN",
    "no resampling": "common",
}


def generate_predictions(
    input_fname: str,
    sep: str = "\\s+|\t+",
    clf: str = "xgb",
    sampling: str = "classic_ADASYN",
    time_per_mol: int = 120,
):
    """Run the B3clf pipeline on one input file.

    The steps are, in order: geometry optimization, descriptor calculation,
    descriptor extraction, descriptor selection, descriptor scaling and
    permeability prediction. An intermediate SDF file named
    ``<mol_tag>_optimized_3d.sdf`` is written to the current working
    directory and is removed again before returning, whether or not the
    pipeline succeeds.

    Parameters
    ----------
    input_fname : str
        Path of the input file; forwarded to ``geometry_optimize``. Either an
        SDF file with molecular geometries or a text file with SMILES strings.
    sep : str
        Column separator forwarded to ``geometry_optimize``. The default is
        the pattern "whitespace run or tab run".
    clf : str
        Classifier identifier forwarded to ``predict_permeability`` as
        ``clf_str`` (for example one of the values of ``classifiers_dict``).
    sampling : str
        Resampling identifier forwarded to ``predict_permeability`` as
        ``sampling_str`` (for example one of the values of
        ``resample_methods_dict``).
    time_per_mol : int
        Forwarded to ``compute_descriptors`` as ``time_per_molecule``
        (presumably a per-molecule time limit in seconds -- TODO confirm).

    Returns
    -------
    tuple
        ``(X_features, result_df)``: the selected and scaled descriptors, and
        the prediction table restricted to whichever of the columns ``ID``,
        ``SMILES``, ``B3clf_predicted_probability`` and
        ``B3clf_predicted_label`` it contains.
    """
    # Name the intermediate file after the part of the input file name that
    # precedes the first dot.
    mol_tag = os.path.basename(input_fname).split(".")[0]
    internal_sdf = f"{mol_tag}_optimized_3d.sdf"

    try:
        # Geometry optimization: writes the optimized structures to internal_sdf.
        geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)

        # Descriptor calculation from the optimized structures; results are
        # kept in memory only (no Excel/CSV output requested).
        df_features = compute_descriptors(
            sdf_file=internal_sdf,
            excel_out=None,
            output_csv=None,
            timeout=None,
            time_per_molecule=time_per_mol,
        )

        # Split the computed table into descriptors and molecule information.
        X_features, info_df = get_descriptors(df=df_features)

        # Select descriptors
        X_features = select_descriptors(df=X_features)

        # Scale descriptors
        X_features = scale_descriptors(df=X_features)

        # Predict permeability with the requested classifier/resampling pair.
        result_df = predict_permeability(
            clf_str=clf,
            sampling_str=sampling,
            features_df=X_features,
            info_df=info_df,
            threshold="none",
        )
    finally:
        # Always clean up the intermediate file, even when a step above
        # raised; the existence check avoids masking the original error when
        # the file was never created.
        if os.path.exists(internal_sdf):
            os.remove(internal_sdf)

    # Keep only the columns meant for display, in the order they appear in
    # the prediction table.
    display_cols = [
        "ID",
        "SMILES",
        "B3clf_predicted_probability",
        "B3clf_predicted_label",
    ]
    result_df = result_df[[col for col in result_df.columns if col in display_cols]]

    return X_features, result_df


# Create the Streamlit app: a title, then an "about" column next to the
# molecule-input column, then a features column next to a predictions column.
st.title(":blue[BBB Permeability Prediction with Imbalanced Learning]")
info_column, upload_column = st.columns(2)

with upload_column:
    st.subheader("Molecule Input")
    with st.container():
        # Row 1: SDF sample download next to the classifier selector.
        sample_sdf_column, classifier_col = st.columns(2)
        with sample_sdf_column:
            with open("sample_input.sdf", "r") as file_sdf:
                st.download_button(
                    label="Download SDF sample file",
                    data=file_sdf,
                    file_name="sample_input.sdf",
                )
        with classifier_col:
            # Option labels must match the keys of `classifiers_dict` exactly
            # (including the "logsistical" spelling) because the selection is
            # used as a dictionary key below.
            classifier = st.selectbox(
                label="Classification algorithm:",
                options=("XGBoost", "kNN", "decision trees", "logsistical regression"),
            )

        # Row 2: SMILES sample download next to the resampling selector.
        sample_smiles_column, resampler_col = st.columns(2)
        with sample_smiles_column:
            with open("sample_input_smiles.csv", "r") as file_smi:
                st.download_button(
                    label="Download SMILES sample file",
                    data=file_smi,
                    file_name="sample_input_smiles.csv",
                )
        with resampler_col:
            # Option labels must match the keys of `resample_methods_dict`.
            resampler = st.selectbox(
                label="Resampling method:",
                options=(
                    "ADASYN",
                    "random undersampling",
                    "Borderline SMOTE",
                    "k-means SMOTE",
                    "SMOTE",
                    "no resampling",
                ),
            )

        # horizontal line
        st.divider()
        file = st.file_uploader(
            label="Upload a CSV, SDF or TXT file",
            type=["csv", "sdf", "txt"],
            help="Input molecule file and only text files are supported.",
        )

with info_column:
    st.subheader("About `B3clf`")
    # fmt: off
    st.markdown(
        """
        `B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. Source code is available at https://github.com/theochem/B3clf."""
    )
    # fmt: on

feature_column, prediction_column = st.columns(2)
with feature_column:
    st.subheader("Features")
    # Empty placeholder element kept from the original layout.
    placeholder_features = st.empty()

with prediction_column:
    st.subheader("Predictions")


# Generate predictions when the user uploads a file
if file:
    # The file name comes from the client, so keep only its final path
    # component before using it to build a path on disk.
    upload_name = os.path.basename(file.name)
    # Prefix for the download file names: everything before the first dot.
    upload_stem = upload_name.split(".")[0]

    # TemporaryDirectory removes the directory and the saved upload when the
    # block exits (also on error); a bare mkdtemp() directory would be left
    # behind for every processed upload.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, upload_name)
        # Save the uploaded bytes so the pipeline can read them from a path.
        # getvalue() returns the full content regardless of the current
        # stream position.
        with open(temp_file_path, "wb") as temp_file:
            temp_file.write(file.getvalue())

        X_features, results = generate_predictions(
            input_fname=temp_file_path,
            sep="\\s+|\t+",
            clf=classifiers_dict[classifier],
            sampling=resample_methods_dict[resampler],
            time_per_mol=120,
        )

    # feature table
    with feature_column:
        st.dataframe(X_features)
        feature_file_name = upload_stem + "_b3clf_features.csv"
        features_csv = X_features.to_csv(index=True)
        st.download_button(
            "Download features as CSV",
            data=features_csv,
            file_name=feature_file_name,
        )

    # prediction table
    with prediction_column:
        if results is not None:
            # Display the predictions in a table
            st.dataframe(results, hide_index=True)
            # Add a button to download the predictions as a CSV file
            predictions_csv = results.to_csv(index=True)
            results_file_name = upload_stem + "_b3clf_predictions.csv"
            st.download_button(
                "Download predictions as CSV",
                data=predictions_csv,
                file_name=results_file_name,
            )

# hide footer
# https://github.com/streamlit/streamlit/issues/892
hide_streamlit_style = """
            <style>
            #MainMenu {visibility: hidden;}
            footer {visibility: hidden;}
            </style>
            """
st.markdown(hide_streamlit_style, unsafe_allow_html=True)