Spaces:

legend1234
/

b3clf_hf

Sleeping

App Files Files Community

b3clf_hf / app.py

legend1234

Reformat the layout

5bd9791 9 months ago

raw

history blame

9.13 kB

	import os
	import tempfile
	from io import StringIO

	import joblib
	import numpy as np
	import pandas as pd

	# page set up
	import streamlit as st
	from b3clf.descriptor_padel import compute_descriptors
	from b3clf.geometry_opt import geometry_optimize
	from b3clf.utils import (
	get_descriptors,
	predict_permeability,
	scale_descriptors,
	select_descriptors,
	)
	from streamlit_ketcher import st_ketcher

	# Page configuration: sets the page title and selects the wide layout.
	# The commented-out keyword arguments are unused placeholders.
	st.set_page_config(
	    page_title="BBB Permeability Prediction with Imbalanced Learning",
	    # page_icon="🧊",
	    layout="wide",
	    # initial_sidebar_state="expanded",
	    # menu_items={
	    #     'Get Help': 'https://www.extremelycoolapp.com/help',
	    #     'Report a bug': "https://www.extremelycoolapp.com/bug",
	    #     'About': "# This is a header. This is an extremely cool app!"
	    # }
	)

	# Load the pre-trained model and feature scaler from the pre_trained/ folder
	# (the file name suggests a kNN model with k-means SMOTE -- TODO confirm).
	# NOTE(review): in the code visible here, `model` and `scaler` are only
	# referenced by the commented-out legacy function; the live pipeline calls
	# the b3clf helpers instead. Confirm before removing these loads.
	model = joblib.load("pre_trained/b3clf_knn_kmeans_SMOTE.joblib")
	scaler = joblib.load("pre_trained/b3clf_scaler.joblib")


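	# Legacy draft of generate_predictions, kept commented out for reference only
	# (superseded by the generate_predictions function defined further below)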
	# def generate_predictions(file):
	# # Read the input file
	# if file.type == "text/csv":
	# df = pd.read_csv(file)
	# elif file.type == "chemical/x-mdl-sdfile":
	# df = pd.read_sdf(file)
	# else:
	# st.error("Invalid file type. Please upload a CSV or SDF file.")
	# return

	# # Compute the molecular geometry, calculate the features, and perform the predictions
	# X = df.drop("ID", axis=1)
	# X_scaled = scaler.transform(X)
	# y_pred_proba = model.predict_proba(X_scaled)[:, 1]
	# y_pred = model.predict(X_scaled)

	# # Create a DataFrame with the predictions
	# results = pd.DataFrame({"ID": df["ID"], "B3clf_predicted_probability": y_pred_proba, "B3clf_predicted_label": y_pred})

	# return results

	# Output-retention switches, both set to "no". Neither is read by the code
	# visible in this file.
	keep_features = keep_sdf = "no"

	# Select-box label -> classifier identifier handed to generate_predictions
	# as `clf`. The keys must match the select-box options character for
	# character (including the "logsistical regression" spelling), because the
	# selected label is used directly as the lookup key.
	classifiers_dict = {
	    "decision trees": "dtree",
	    "kNN": "knn",
	    "logsistical regression": "logreg",
	    "XGBoost": "xgb",
	}

	# Select-box label -> resampling identifier handed to generate_predictions
	# as `sampling`.
	resample_methods_dict = {
	    "random undersampling": "classic_RandUndersampling",
	    "SMOTE": "classic_SMOTE",
	    "Borderline SMOTE": "borderline_SMOTE",
	    "k-means SMOTE": "kmeans_SMOTE",
	    "ADASYN": "classic_ADASYN",
	    "no resampling": "common",
	}


	def generate_predictions(
	    input_fname: str,
	    sep: str = "\\s+\\|\t+",
	    clf: str = "xgb",
	    sampling: str = "classic_ADASYN",
	    time_per_mol: int = 120,
	):
	    """
	    Generate predictions for a given input file.

	    The input is geometry-optimized into an intermediate SDF file, molecular
	    descriptors are computed from that file, then selected and scaled, and
	    finally passed to the permeability predictor.

	    Parameters
	    ----------
	    input_fname : str
	        Path to the input file: either an SDF file with molecular geometries
	        or a text file with SMILES strings.
	    sep : str
	        Column separator forwarded unchanged to ``geometry_optimize``.
	    clf : str
	        Classifier identifier forwarded to ``predict_permeability``.
	    sampling : str
	        Resampling identifier forwarded to ``predict_permeability``.
	    time_per_mol : int
	        Forwarded to ``compute_descriptors`` as ``time_per_molecule``
	        (presumably a per-molecule time limit -- TODO confirm the unit).

	    Returns
	    -------
	    tuple
	        ``(X_features, result_df)``: the scaled descriptor table and the
	        prediction table restricted to the ID, SMILES, predicted-probability
	        and predicted-label columns that are present.
	    """
	    # The default `sep` is spelled with explicit backslashes so the string
	    # value is unchanged while avoiding the invalid "\s" / "\|" escapes.
	    # NOTE(review): as a regular expression "\|" matches a literal pipe; if
	    # alternation ("\s+|\t+") was intended, the default should be revisited.

	    # The intermediate SDF is named after the input file (text before the
	    # first dot) and is written relative to the current working directory.
	    mol_tag = os.path.basename(input_fname).split(".")[0]
	    internal_sdf = f"{mol_tag}_optimized_3d.sdf"

	    try:
	        # Geometry optimization: writes the optimized structures to internal_sdf
	        geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)

	        # Compute descriptors from the optimized structures
	        df_features = compute_descriptors(
	            sdf_file=internal_sdf,
	            excel_out=None,
	            output_csv=None,
	            timeout=None,
	            time_per_molecule=time_per_mol,
	        )

	        # Split the computed table into descriptor values and molecule info
	        X_features, info_df = get_descriptors(df=df_features)

	        # Select descriptors
	        X_features = select_descriptors(df=X_features)

	        # Scale descriptors
	        X_features = scale_descriptors(df=X_features)

	        # Predict permeability with the requested classifier / resampling pair
	        result_df = predict_permeability(
	            clf_str=clf,
	            sampling_str=sampling,
	            features_df=X_features,
	            info_df=info_df,
	            threshold="none",
	        )
	    finally:
	        # Always remove the intermediate SDF, even when a step above raised,
	        # so failed runs do not leave files behind. A missing file (e.g. the
	        # optimization failed before writing it) must not mask that error.
	        try:
	            os.remove(internal_sdf)
	        except FileNotFoundError:
	            pass

	    # Keep only the columns meant for display, in the order they appear
	    display_cols = [
	        "ID",
	        "SMILES",
	        "B3clf_predicted_probability",
	        "B3clf_predicted_label",
	    ]
	    result_df = result_df[
	        [col for col in result_df.columns.to_list() if col in display_cols]
	    ]

	    return X_features, result_df


	# Create the Streamlit app: a title and a first row of two columns
	# (info_column first, upload_column second).
	# NOTE(review): the nesting of the `with` blocks below was reconstructed from
	# a copy whose indentation was lost -- confirm against the original file.
	st.title(":blue[BBB Permeability Prediction with Imbalanced Learning]")
	info_column, upload_column = st.columns(2)

	# Second column: sample-file downloads, model choices and the file uploader

	with upload_column:
	    st.subheader("Molecule Input")
	    with st.container():
	        # uneven columns
	        # st.columns((2, 1, 1, 1))
	        # first sub-row: SDF sample download next to the classifier choice
	        sample_sdf_column, classifier_col = st.columns(2)
	        with sample_sdf_column:
	            # download sample sdf; the open file handle is the download payload
	            with open("sample_input.sdf", "r") as file_sdf:
	                btn = st.download_button(
	                    label="Download SDF sample file",
	                    data=file_sdf,
	                    file_name="sample_input.sdf",
	                )
	        with classifier_col:
	            # the selected label is later used as a key into classifiers_dict
	            classifier = st.selectbox(
	                label="Classification algorithm:",
	                options=("XGBoost", "kNN", "decision trees", "logsistical regression"),
	            )

	        # second sub-row: SMILES sample download next to the resampling choice
	        sample_smiles_column, resampler_col = st.columns(2)
	        with sample_smiles_column:
	            # download sample smiles; the open file handle is the download payload
	            with open("sample_input_smiles.csv", "r") as file_smi:
	                btn = st.download_button(
	                    label="Download SMILES sample file",
	                    data=file_smi,
	                    file_name="sample_input_smiles.csv",
	                )
	        with resampler_col:
	            # the selected label is later used as a key into resample_methods_dict
	            resampler = st.selectbox(
	                label="Resampling method:",
	                options=(
	                    "ADASYN",
	                    "random undersampling",
	                    "Borderline SMOTE",
	                    "k-means SMOTE",
	                    "SMOTE",
	                    "no resampling",
	                ),
	            )

	        # horizontal line
	        st.divider()
	        # `file` is falsy until something is uploaded; the prediction step
	        # further down only runs when it is truthy.
	        file = st.file_uploader(
	            label="Upload a CSV, SDF or TXT file",
	            type=["csv", "sdf", "txt"],
	            help="Input molecule file and only text files are supported.",
	            # accept_multiple_files=False,
	        )
	        # st.write("The content of the file will be displayed below once uploaded.")
	        # if file:
	        #     if "csv" in file.name or "txt" in file.name:
	        #         st.write(file.read().decode("utf-8"))
	        # st.write(file)

	# First column: short description of the package with a link to its source
	with info_column:
	    st.subheader("About `B3clf`")
	    # fmt: off
	    st.markdown(
	        """
	`B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. Source code is available at https://github.com/theochem/B3clf.""" #
	    )
	    # fmt: on

	# Second row: one column for the feature table, one for the predictions.
	# Both are filled in once a file has been uploaded.
	feature_column, prediction_column = st.columns(2)
	with feature_column:
	    st.subheader("Features")

	    # Empty placeholder element; the live code visible here does not write to it.
	    placeholder_features = st.empty()
	    # placeholder_features = pd.DataFrame(index=[1, 2, 3, 4],
	    #                                     columns=["ID", "nAcid", "ALogP", "Alogp2",
	    #                                              "AMR", "naAromAtom", "nH", "nN"])
	    # st.dataframe(placeholder_features)
	    # placeholder_features.text("molecular features")

	with prediction_column:
	    st.subheader("Predictions")
	    # placeholder_predictions = st.empty()
	    # placeholder_predictions.text("prediction")

	# Generate predictions when the user uploads a file
	if file:
	    # Use only the base name of the client-supplied file name so the upload
	    # can never be written outside the temporary directory.
	    upload_name = os.path.basename(file.name)
	    # Stem shared by both download file names (text before the first dot)
	    download_stem = upload_name.split(".")[0]

	    # The temporary directory (and the saved upload inside it) is removed as
	    # soon as the predictions are computed, so repeated reruns do not pile
	    # up directories on disk.
	    with tempfile.TemporaryDirectory() as temp_dir:
	        # Save the uploaded file under its own name so the pipeline can read
	        # it from a path.
	        temp_file_path = os.path.join(temp_dir, upload_name)
	        with open(temp_file_path, "wb") as temp_file:
	            # getvalue() returns the full content regardless of the current
	            # read position of the uploaded-file buffer.
	            temp_file.write(file.getvalue())

	        # The separator is the same string value as before, written with
	        # explicit backslashes to avoid invalid escape sequences.
	        X_features, results = generate_predictions(
	            input_fname=temp_file_path,
	            sep="\\s+\\|\t+",
	            clf=classifiers_dict[classifier],
	            sampling=resample_methods_dict[resampler],
	            time_per_mol=120,
	        )

	    # feature table
	    with feature_column:
	        st.dataframe(X_features)
	        # placeholder_features.dataframe(X_features, hide_index=False)
	        feature_file_name = download_stem + "_b3clf_features.csv"
	        features_csv = X_features.to_csv(index=True)
	        st.download_button(
	            "Download features as CSV",
	            data=features_csv,
	            file_name=feature_file_name,
	        )

	    # prediction table
	    with prediction_column:
	        # st.subheader("Predictions")
	        if results is not None:
	            # Display the predictions in a table
	            st.dataframe(results, hide_index=True)
	            # Add a button to download the predictions as a CSV file
	            predictions_csv = results.to_csv(index=True)
	            results_file_name = download_stem + "_b3clf_predictions.csv"
	            st.download_button(
	                "Download predictions as CSV",
	                data=predictions_csv,
	                file_name=results_file_name,
	            )

	# Hide the Streamlit main menu and footer by injecting a CSS <style> block
	# that sets both elements to `visibility: hidden`.
	# https://github.com/streamlit/streamlit/issues/892
	hide_streamlit_style = """
	<style>
	#MainMenu {visibility: hidden;}
	footer {visibility: hidden;}
	</style>
	"""
	# unsafe_allow_html=True is presumably required so the <style> element is
	# emitted as HTML rather than shown as text -- confirm against Streamlit docs.
	st.markdown(hide_streamlit_style, unsafe_allow_html=True)