# b3clf_hf / app.py
# legend1234's picture
# Reformat the layout
# 5bd9791
# raw
# history blame
# 9.13 kB
import os
import tempfile
from io import StringIO
import joblib
import numpy as np
import pandas as pd
# page set up
import streamlit as st
from b3clf.descriptor_padel import compute_descriptors
from b3clf.geometry_opt import geometry_optimize
from b3clf.utils import (
get_descriptors,
predict_permeability,
scale_descriptors,
select_descriptors,
)
from streamlit_ketcher import st_ketcher
# Page-wide Streamlit settings. Only the title and the wide layout are set;
# the icon, sidebar state and menu items are left at Streamlit's defaults.
_page_settings = {
    "page_title": "BBB Permeability Prediction with Imbalanced Learning",
    "layout": "wide",
}
st.set_page_config(**_page_settings)
# Deserialize the bundled pre-trained classifier and feature scaler, in that
# order, from the local ``pre_trained`` directory.
# NOTE(review): neither ``model`` nor ``scaler`` is referenced by live code in
# this file (only by commented-out code) -- confirm whether they are still needed.
model, scaler = map(
    joblib.load,
    (
        "pre_trained/b3clf_knn_kmeans_SMOTE.joblib",
        "pre_trained/b3clf_scaler.joblib",
    ),
)
# Define a function to generate predictions
# def generate_predictions(file):
# # Read the input file
# if file.type == "text/csv":
# df = pd.read_csv(file)
# elif file.type == "chemical/x-mdl-sdfile":
# df = pd.read_sdf(file)
# else:
# st.error("Invalid file type. Please upload a CSV or SDF file.")
# return
# # Compute the molecular geometry, calculate the features, and perform the predictions
# X = df.drop("ID", axis=1)
# X_scaled = scaler.transform(X)
# y_pred_proba = model.predict_proba(X_scaled)[:, 1]
# y_pred = model.predict(X_scaled)
# # Create a DataFrame with the predictions
# results = pd.DataFrame({"ID": df["ID"], "B3clf_predicted_probability": y_pred_proba, "B3clf_predicted_label": y_pred})
# return results
# Module-level flags; not referenced by any live code visible in this file.
keep_features = "no"
keep_sdf = "no"

# Maps the human-readable classifier label shown in the UI to the short
# identifier passed on as ``clf`` to ``generate_predictions``.
classifiers_dict = {
    "decision trees": "dtree",
    "kNN": "knn",
    # Misspelled label kept on purpose: the classifier select box offers this
    # exact text and looks it up here, so removing it would raise KeyError.
    "logsistical regression": "logreg",
    # Correctly spelled alias so callers can use the proper name.
    "logistic regression": "logreg",
    "XGBoost": "xgb",
}

# Maps the human-readable resampling label shown in the UI to the identifier
# passed on as ``sampling`` to ``generate_predictions``.
resample_methods_dict = {
    "random undersampling": "classic_RandUndersampling",
    "SMOTE": "classic_SMOTE",
    "Borderline SMOTE": "borderline_SMOTE",
    "k-means SMOTE": "kmeans_SMOTE",
    "ADASYN": "classic_ADASYN",
    "no resampling": "common",
}
def generate_predictions(
    input_fname: str,
    sep: str = r"\s+|\t+",
    clf: str = "xgb",
    sampling: str = "classic_ADASYN",
    time_per_mol: int = 120,
):
    """Run the B3clf pipeline on one input file.

    The steps are: geometry optimization, descriptor calculation, descriptor
    selection and scaling, and finally the permeability prediction.

    Parameters
    ----------
    input_fname : str
        Path to the input file: either an SDF file with molecular geometries
        or a text file with SMILES strings.
    sep : str
        Column separator forwarded to ``geometry_optimize``. The default is a
        raw string so that ``\\s`` is not an invalid string escape; as a
        regular expression it matches the same text as before (presumably it
        is interpreted as a regex -- confirm against ``geometry_optimize``).
    clf : str
        Classifier identifier forwarded to ``predict_permeability``.
    sampling : str
        Resampling-strategy identifier forwarded to ``predict_permeability``.
    time_per_mol : int
        Forwarded to ``compute_descriptors`` as ``time_per_molecule``
        (presumably a per-molecule time limit in seconds -- TODO confirm).

    Returns
    -------
    tuple
        ``(X_features, result_df)``: the selected and scaled descriptors, and
        the prediction table restricted to the ID, SMILES, predicted
        probability and predicted label columns that are present.
    """
    mol_tag = os.path.basename(input_fname).split(".")[0]
    # Intermediate 3D structures are written next to the working directory
    # and must not outlive this call.
    internal_sdf = f"{mol_tag}_optimized_3d.sdf"

    try:
        geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)

        df_features = compute_descriptors(
            sdf_file=internal_sdf,
            excel_out=None,
            output_csv=None,
            timeout=None,
            time_per_molecule=time_per_mol,
        )

        # Split the raw descriptor table into features and molecule info,
        # then reduce and scale the features.
        X_features, info_df = get_descriptors(df=df_features)
        X_features = select_descriptors(df=X_features)
        X_features = scale_descriptors(df=X_features)

        result_df = predict_permeability(
            clf_str=clf,
            sampling_str=sampling,
            features_df=X_features,
            info_df=info_df,
            threshold="none",
        )
    finally:
        # Clean up even when a pipeline step raises; the file may not exist
        # yet if geometry optimization itself failed.
        if os.path.exists(internal_sdf):
            os.remove(internal_sdf)

    # Keep only the columns meant for display, in the order they appear in
    # the prediction table.
    display_cols = [
        "ID",
        "SMILES",
        "B3clf_predicted_probability",
        "B3clf_predicted_label",
    ]
    result_df = result_df[[col for col in result_df.columns if col in display_cols]]

    return X_features, result_df
# Page header and top row: the "About" panel on the left, molecule input on
# the right.
st.title(":blue[BBB Permeability Prediction with Imbalanced Learning]")
info_column, upload_column = st.columns(2)


def _sample_download(path, label):
    """Render a download button that serves the bundled sample file at *path*."""
    with open(path, "r") as handle:
        return st.download_button(label=label, data=handle, file_name=path)


with upload_column:
    st.subheader("Molecule Input")
    with st.container():
        # First row: SDF sample download next to the classifier picker.
        sample_sdf_column, classifier_col = st.columns(2)
        with sample_sdf_column:
            btn = _sample_download("sample_input.sdf", "Download SDF sample file")
        with classifier_col:
            # Option labels are looked up in ``classifiers_dict`` later on, so
            # they must match its keys verbatim.
            classifier = st.selectbox(
                label="Classification algorithm:",
                options=("XGBoost", "kNN", "decision trees", "logsistical regression"),
            )

        # Second row: SMILES sample download next to the resampler picker.
        sample_smiles_column, resampler_col = st.columns(2)
        with sample_smiles_column:
            btn = _sample_download(
                "sample_input_smiles.csv", "Download SMILES sample file"
            )
        with resampler_col:
            # Option labels are looked up in ``resample_methods_dict`` later on.
            resampler = st.selectbox(
                label="Resampling method:",
                options=(
                    "ADASYN",
                    "random undersampling",
                    "Borderline SMOTE",
                    "k-means SMOTE",
                    "SMOTE",
                    "no resampling",
                ),
            )

    # Horizontal rule separating the options from the uploader.
    st.divider()
    file = st.file_uploader(
        label="Upload a CSV, SDF or TXT file",
        type=["csv", "sdf", "txt"],
        help="Input molecule file and only text files are supported.",
    )

with info_column:
    st.subheader("About `B3clf`")
    # The text starts with a newline, exactly as in the original literal.
    st.markdown(
        "\n"
        "`B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. Source code is available at https://github.com/theochem/B3clf."
    )
# Bottom row: computed features on the left, predictions on the right.
feature_column, prediction_column = st.columns(2)
with feature_column:
    st.subheader("Features")
    placeholder_features = st.empty()
with prediction_column:
    st.subheader("Predictions")

# Run the pipeline as soon as a file has been uploaded.
if file:
    # Strip any directory components so the upload always lands inside the
    # temporary directory.
    upload_name = os.path.basename(file.name)
    # Stem shared by both download file names.
    download_stem = upload_name.split(".")[0]

    # The pipeline needs a real path on disk; the directory and the copy of
    # the upload are removed automatically when the block exits, even if the
    # pipeline raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, upload_name)
        with open(temp_file_path, "wb") as temp_file:
            # getvalue() returns the whole buffer regardless of the current
            # stream position, unlike read().
            temp_file.write(file.getvalue())

        X_features, results = generate_predictions(
            input_fname=temp_file_path,
            sep=r"\s+|\t+",
            clf=classifiers_dict[classifier],
            sampling=resample_methods_dict[resampler],
            time_per_mol=120,
        )

    # Feature table and its CSV download.
    with feature_column:
        st.dataframe(X_features)
        feature_file_name = download_stem + "_b3clf_features.csv"
        features_csv = X_features.to_csv(index=True)
        st.download_button(
            "Download features as CSV",
            data=features_csv,
            file_name=feature_file_name,
        )

    # Prediction table and its CSV download.
    with prediction_column:
        if results is not None:
            st.dataframe(results, hide_index=True)
            predictions_csv = results.to_csv(index=True)
            results_file_name = download_stem + "_b3clf_predictions.csv"
            st.download_button(
                "Download predictions as CSV",
                data=predictions_csv,
                file_name=results_file_name,
            )
# Inject CSS that hides Streamlit's main menu and footer.
# See https://github.com/streamlit/streamlit/issues/892
hide_streamlit_style = (
    "\n"
    "<style>\n"
    "#MainMenu {visibility: hidden;}\n"
    "footer {visibility: hidden;}\n"
    "</style>\n"
)
st.markdown(hide_streamlit_style, unsafe_allow_html=True)