# Hugging Face Space: legend1234/b3clf_hf (status: Sleeping)
# File size: 9,134 Bytes

import os
import tempfile
from io import StringIO

import joblib
import numpy as np
import pandas as pd

# page set up
import streamlit as st
from b3clf.descriptor_padel import compute_descriptors
from b3clf.geometry_opt import geometry_optimize
from b3clf.utils import (
    get_descriptors,
    predict_permeability,
    scale_descriptors,
    select_descriptors,
)
from streamlit_ketcher import st_ketcher

# Page configuration; issued before any other Streamlit command in this script.
st.set_page_config(
    page_title="BBB Permeability Prediction with Imbalanced Learning",
    # page_icon="🧊",
    layout="wide",
    # initial_sidebar_state="expanded",
    # menu_items={
    #     'Get Help': 'https://www.extremelycoolapp.com/help',
    #     'Report a bug': "https://www.extremelycoolapp.com/bug",
    #     'About': "# This is a header. This is an *extremely* cool app!"
    # }
)


@st.cache_resource(show_spinner=False)
def _load_pretrained(path):
    """Load a joblib artifact from ``path``.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded object avoids deserialising the same file on each rerun.
    """
    return joblib.load(path)


# Load the pre-trained model and feature scaler
model = _load_pretrained("pre_trained/b3clf_knn_kmeans_SMOTE.joblib")
scaler = _load_pretrained("pre_trained/b3clf_scaler.joblib")


# Define a function to generate predictions
# def generate_predictions(file):
#     # Read the input file
#     if file.type == "text/csv":
#         df = pd.read_csv(file)
#     elif file.type == "chemical/x-mdl-sdfile":
#         df = pd.read_sdf(file)
#     else:
#         st.error("Invalid file type. Please upload a CSV or SDF file.")
#         return

#     # Compute the molecular geometry, calculate the features, and perform the predictions
#     X = df.drop("ID", axis=1)
#     X_scaled = scaler.transform(X)
#     y_pred_proba = model.predict_proba(X_scaled)[:, 1]
#     y_pred = model.predict(X_scaled)

#     # Create a DataFrame with the predictions
#     results = pd.DataFrame({"ID": df["ID"], "B3clf_predicted_probability": y_pred_proba, "B3clf_predicted_label": y_pred})

#     return results

# NOTE(review): these two flags are not read anywhere in the visible part of
# this file; kept so that any external reader of the names keeps working.
keep_features = "no"
keep_sdf = "no"

# Human-readable classifier name -> short classifier identifier.
classifiers_dict = {
    "decision trees": "dtree",
    "kNN": "knn",
    # Correctly spelled name. The misspelled key below is kept because the
    # classifier selectbox in this file looks it up verbatim.
    "logistic regression": "logreg",
    "logsistical regression": "logreg",
    "XGBoost": "xgb",
}

# Human-readable resampling name -> short resampling identifier.
resample_methods_dict = {
    "random undersampling": "classic_RandUndersampling",
    "SMOTE": "classic_SMOTE",
    "Borderline SMOTE": "borderline_SMOTE",
    "k-means SMOTE": "kmeans_SMOTE",
    "ADASYN": "classic_ADASYN",
    "no resampling": "common",
}


def generate_predictions(
    input_fname: str,
    sep: str = r"\s+|\t+",
    clf: str = "xgb",
    sampling: str = "classic_ADASYN",
    time_per_mol: int = 120,
):
    """
    Generate predictions for a given input file.

    The pipeline is: geometry optimization -> descriptor computation ->
    descriptor selection -> scaling -> permeability prediction.

    Parameters
    ----------
    input_fname : str
        Path to the input file: either an SDF file with molecular geometries
        or a text file with SMILES strings.
    sep : str
        Column separator forwarded to ``geometry_optimize``. The default is a
        raw string so that ``\\s`` is not an invalid string escape; it is the
        same pattern as before when interpreted as a regular expression.
    clf : str
        Classifier identifier forwarded to ``predict_permeability``.
    sampling : str
        Resampling identifier forwarded to ``predict_permeability``.
    time_per_mol : int
        Forwarded to ``compute_descriptors`` as ``time_per_molecule``.

    Returns
    -------
    tuple
        ``(X_features, result_df)``: the selected and scaled descriptors, and
        the prediction table restricted to the ID, SMILES, predicted
        probability and predicted label columns that are present.
    """
    mol_tag = os.path.basename(input_fname).split(".")[0]
    # NOTE(review): the intermediate SDF is written relative to the current
    # working directory, so two runs with the same input name share one path.
    internal_sdf = f"{mol_tag}_optimized_3d.sdf"

    try:
        # Geometry optimization; writes the optimized structures to internal_sdf
        geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)

        df_features = compute_descriptors(
            sdf_file=internal_sdf,
            excel_out=None,
            output_csv=None,
            timeout=None,
            time_per_molecule=time_per_mol,
        )

        # Get computed descriptors
        X_features, info_df = get_descriptors(df=df_features)

        # Select descriptors
        X_features = select_descriptors(df=X_features)

        # Scale descriptors
        X_features = scale_descriptors(df=X_features)

        # Run the chosen classifier / resampling combination
        result_df = predict_permeability(
            clf_str=clf,
            sampling_str=sampling,
            features_df=X_features,
            info_df=info_df,
            threshold="none",
        )
    finally:
        # Remove the intermediate SDF even when one of the steps above raises,
        # so failed runs do not leave files behind.
        if os.path.exists(internal_sdf):
            os.remove(internal_sdf)

    # Keep only the columns meant for display, in their existing order
    display_cols = [
        "ID",
        "SMILES",
        "B3clf_predicted_probability",
        "B3clf_predicted_label",
    ]
    result_df = result_df[[col for col in result_df.columns if col in display_cols]]

    return X_features, result_df


# Create the Streamlit app
st.title(":blue[BBB Permeability Prediction with Imbalanced Learning]")
info_column, upload_column = st.columns(2)

# Labels shown to the user in place of the raw option strings. The option
# value itself is left untouched because it is used as a lookup key later.
_classifier_labels = {"logsistical regression": "logistic regression"}

# Molecule input panel: sample downloads, model options and the file uploader
with upload_column:
    st.subheader("Molecule Input")
    with st.container():
        # first row: SDF sample download next to the classifier choice
        sample_sdf_column, classifier_col = st.columns(2)
        with sample_sdf_column:
            # download sample sdf
            with open("sample_input.sdf", "r") as file_sdf:
                st.download_button(
                    label="Download SDF sample file",
                    data=file_sdf,
                    file_name="sample_input.sdf",
                )
        with classifier_col:
            classifier = st.selectbox(
                label="Classification algorithm:",
                options=("XGBoost", "kNN", "decision trees", "logsistical regression"),
                # only the displayed text changes; the returned value is the option
                format_func=lambda option: _classifier_labels.get(option, option),
            )

        # second row: SMILES sample download next to the resampling choice
        sample_smiles_column, resampler_col = st.columns(2)
        with sample_smiles_column:
            # download sample smiles
            with open("sample_input_smiles.csv", "r") as file_smi:
                st.download_button(
                    label="Download SMILES sample file",
                    data=file_smi,
                    file_name="sample_input_smiles.csv",
                )
        with resampler_col:
            resampler = st.selectbox(
                label="Resampling method:",
                options=(
                    "ADASYN",
                    "random undersampling",
                    "Borderline SMOTE",
                    "k-means SMOTE",
                    "SMOTE",
                    "no resampling",
                ),
            )

        # horizontal line
        st.divider()
        file = st.file_uploader(
            label="Upload a CSV, SDF or TXT file",
            type=["csv", "sdf", "txt"],
            help="Input molecule file and only text files are supported.",
            # accept_multiple_files=False,
        )
# st.write("The content of the file will be displayed below once uploaded.")
# if file:
# if "csv" in file.name or "txt" in file.name:
#     st.write(file.read().decode("utf-8"))
# st.write(file)

# Left-hand panel: short description of the package and a link to its source.
info_column.subheader("About `B3clf`")
# fmt: off
about_b3clf = """
        `B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. Source code is available at https://github.com/theochem/B3clf."""
# fmt: on
info_column.markdown(about_b3clf)

# Second row of the page: features on the left, predictions on the right.
feature_column, prediction_column = st.columns(2)

feature_column.subheader("Features")
# Empty slot created directly under the "Features" heading.
placeholder_features = feature_column.empty()

prediction_column.subheader("Predictions")


# Generate predictions when the user uploads a file
if file:
    # Keep only the final path component of the client-supplied name so it
    # cannot point outside the temporary directory.
    upload_name = os.path.basename(file.name)
    upload_stem = upload_name.split(".")[0]

    # The directory and the saved upload are removed when the block exits,
    # including when prediction raises, so reruns do not accumulate files.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Save the uploaded file to a temporary path
        temp_file_path = os.path.join(temp_dir, upload_name)
        with open(temp_file_path, "wb") as temp_file:
            # getvalue() returns the whole buffer regardless of the current
            # read position of the uploaded file object.
            temp_file.write(file.getvalue())

        X_features, results = generate_predictions(
            input_fname=temp_file_path,
            sep=r"\s+|\t+",
            clf=classifiers_dict[classifier],
            sampling=resample_methods_dict[resampler],
            time_per_mol=120,
        )

    # feature table
    with feature_column:
        st.dataframe(X_features)
        feature_file_name = upload_stem + "_b3clf_features.csv"
        features_csv = X_features.to_csv(index=True)
        st.download_button(
            "Download features as CSV",
            data=features_csv,
            file_name=feature_file_name,
        )

    # prediction table
    with prediction_column:
        if results is not None:
            # Display the predictions in a table
            st.dataframe(results, hide_index=True)
            # Add a button to download the predictions as a CSV file
            predictions_csv = results.to_csv(index=True)
            results_file_name = upload_stem + "_b3clf_predictions.csv"
            st.download_button(
                "Download predictions as CSV",
                data=predictions_csv,
                file_name=results_file_name,
            )

# Hide Streamlit's main menu and footer by injecting a CSS snippet.
# https://github.com/streamlit/streamlit/issues/892
hide_streamlit_style = (
    "\n"
    "            <style>\n"
    "            #MainMenu {visibility: hidden;}\n"
    "            footer {visibility: hidden;}\n"
    "            </style>\n"
    "            "
)
# unsafe_allow_html is passed so the raw <style> block is not escaped.
st.markdown(body=hide_streamlit_style, unsafe_allow_html=True)