"""Streamlit front end for B3clf blood-brain barrier permeability prediction."""

import os
import tempfile
from io import StringIO

import joblib
import numpy as np
import pandas as pd
import streamlit as st
from b3clf.descriptor_padel import compute_descriptors
from b3clf.geometry_opt import geometry_optimize
from b3clf.utils import (
    get_descriptors,
    predict_permeability,
    scale_descriptors,
    select_descriptors,
)
from streamlit_ketcher import st_ketcher

# Page setup
st.set_page_config(
    page_title="BBB Permeability Prediction with Imbalanced Learning",
    layout="wide",
)

# Load the pre-trained model and feature scaler.
# NOTE(review): neither object is referenced by the active code path below
# (predictions go through `predict_permeability`); confirm whether these
# loads are still needed before removing them.
model = joblib.load("pre_trained/b3clf_knn_kmeans_SMOTE.joblib")
scaler = joblib.load("pre_trained/b3clf_scaler.joblib")

keep_features = "no"
keep_sdf = "no"

# UI label -> identifier passed to `predict_permeability`.
# Insertion order is the order shown in the select boxes; the first entry is
# the default selection.
classifiers_dict = {
    "XGBoost": "xgb",
    "kNN": "knn",
    "decision trees": "dtree",
    "logistic regression": "logreg",
}
resample_methods_dict = {
    "ADASYN": "classic_ADASYN",
    "random undersampling": "classic_RandUndersampling",
    "Borderline SMOTE": "borderline_SMOTE",
    "k-means SMOTE": "kmeans_SMOTE",
    "SMOTE": "classic_SMOTE",
    "no resampling": "common",
}


def generate_predictions(
    input_fname: str,
    sep: str = r"\s+|\t+",
    clf: str = "xgb",
    sampling: str = "classic_ADASYN",
    time_per_mol: int = 120,
):
    """
    Generate predictions for a given input file.

    The input is geometry-optimized into an intermediate SDF file, descriptors
    are computed from that file, then selected and scaled, and finally passed
    to the chosen classifier.

    Parameters
    ----------
    input_fname : str
        Path to either an SDF file with molecular geometries or a text file
        with SMILES strings.
    sep : str
        Separator pattern forwarded to ``geometry_optimize``.
    clf : str
        Classifier identifier forwarded to ``predict_permeability``
        (one of the values of ``classifiers_dict``).
    sampling : str
        Resampling identifier forwarded to ``predict_permeability``
        (one of the values of ``resample_methods_dict``).
    time_per_mol : int
        Forwarded to ``compute_descriptors`` as ``time_per_molecule``.

    Returns
    -------
    tuple
        ``(X_features, result_df)``: the scaled descriptor table and the
        prediction table restricted to the ID, SMILES, predicted-probability
        and predicted-label columns that are present.
    """
    mol_tag = os.path.basename(input_fname).split(".")[0]
    # Intermediate geometry file, written to the current working directory.
    internal_sdf = f"{mol_tag}_optimized_3d.sdf"

    try:
        # Geometry optimization
        geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)

        df_features = compute_descriptors(
            sdf_file=internal_sdf,
            excel_out=None,
            output_csv=None,
            timeout=None,
            time_per_molecule=time_per_mol,
        )

        # Get computed descriptors
        X_features, info_df = get_descriptors(df=df_features)
        # Select descriptors
        X_features = select_descriptors(df=X_features)
        # Scale descriptors
        X_features = scale_descriptors(df=X_features)

        # Run the classifier
        result_df = predict_permeability(
            clf_str=clf,
            sampling_str=sampling,
            features_df=X_features,
            info_df=info_df,
            threshold="none",
        )
    finally:
        # Always clean up the intermediate file, including when a pipeline
        # step raises; otherwise stale SDF files pile up in the working dir.
        if os.path.exists(internal_sdf):
            os.remove(internal_sdf)

    # Keep only the columns meant for display, in their existing order.
    display_cols = [
        "ID",
        "SMILES",
        "B3clf_predicted_probability",
        "B3clf_predicted_label",
    ]
    result_df = result_df[
        [col for col in result_df.columns.to_list() if col in display_cols]
    ]

    return X_features, result_df


# Create the Streamlit app
st.title(":blue[BBB Permeability Prediction with Imbalanced Learning]")
info_column, upload_column = st.columns(2)

# Molecule input: sample downloads, model options and the file uploader
with upload_column:
    st.subheader("Molecule Input")
    with st.container():
        # First row: SDF sample download next to the classifier choice
        sample_sdf_column, classifier_col = st.columns(2)
        with sample_sdf_column:
            with open("sample_input.sdf", "r") as file_sdf:
                st.download_button(
                    label="Download SDF sample file",
                    data=file_sdf,
                    file_name="sample_input.sdf",
                )
        with classifier_col:
            classifier = st.selectbox(
                label="Classification algorithm:",
                # Derived from the lookup table so labels and keys cannot drift.
                options=tuple(classifiers_dict),
            )

        # Second row: SMILES sample download next to the resampler choice
        sample_smiles_column, resampler_col = st.columns(2)
        with sample_smiles_column:
            with open("sample_input_smiles.csv", "r") as file_smi:
                st.download_button(
                    label="Download SMILES sample file",
                    data=file_smi,
                    file_name="sample_input_smiles.csv",
                )
        with resampler_col:
            resampler = st.selectbox(
                label="Resampling method:",
                options=tuple(resample_methods_dict),
            )

    # horizontal line
    st.divider()
    file = st.file_uploader(
        label="Upload a CSV, SDF or TXT file",
        type=["csv", "sdf", "txt"],
        help="Input molecule file and only text files are supported.",
    )

with info_column:
    st.subheader("About `B3clf`")
    st.markdown(
        "`B3clf` is a Python package for predicting the blood-brain barrier "
        "(BBB) permeability of small molecules using imbalanced learning.\n"
        "Source code is available at https://github.com/theochem/B3clf."
    )

feature_column, prediction_column = st.columns(2)
with feature_column:
    st.subheader("Features")
    placeholder_features = st.empty()

with prediction_column:
    st.subheader("Predictions")

# Generate predictions when the user uploads a file
if file:
    # The file name comes from the client; drop any directory components
    # before using it to build a path on disk.
    upload_name = os.path.basename(file.name)
    upload_stem = upload_name.split(".")[0]

    # The temporary directory (and the uploaded copy inside it) is removed
    # as soon as the predictions have been computed.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, upload_name)
        with open(temp_file_path, "wb") as temp_file:
            # getvalue() returns the full content regardless of the buffer's
            # current read position.
            temp_file.write(file.getvalue())

        X_features, results = generate_predictions(
            input_fname=temp_file_path,
            sep=r"\s+|\t+",
            clf=classifiers_dict[classifier],
            sampling=resample_methods_dict[resampler],
            time_per_mol=120,
        )

    # feature table
    with feature_column:
        st.dataframe(X_features)
        feature_file_name = upload_stem + "_b3clf_features.csv"
        features_csv = X_features.to_csv(index=True)
        st.download_button(
            "Download features as CSV",
            data=features_csv,
            file_name=feature_file_name,
        )

    # prediction table
    with prediction_column:
        if results is not None:
            # Display the predictions in a table
            st.dataframe(results, hide_index=True)
            # Add a button to download the predictions as a CSV file
            predictions_csv = results.to_csv(index=True)
            results_file_name = upload_stem + "_b3clf_predictions.csv"
            st.download_button(
                "Download predictions as CSV",
                data=predictions_csv,
                file_name=results_file_name,
            )

# hide footer
# https://github.com/streamlit/streamlit/issues/892
# NOTE(review): the style payload was empty, so nothing was being hidden;
# restored a footer-hiding rule to match the comment above - confirm this is
# the intended CSS.
hide_streamlit_style = """
<style>
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)