b3clf_hf / utils.py
legend1234's picture
Add utils.py
3d6fbe8
import itertools as it
import os
import joblib
import numpy as np
import pandas as pd
import pkg_resources
import streamlit as st
from b3clf.descriptor_padel import compute_descriptors
from b3clf.geometry_opt import geometry_optimize
from b3clf.utils import get_descriptors, scale_descriptors, select_descriptors
@st.cache_resource()
def load_all_models():
"""Get b3clf fitted classifier"""
clf_list = ["dtree", "knn", "logreg", "xgb"]
sampling_list = [
"borderline_SMOTE",
"classic_ADASYN",
"classic_RandUndersampling",
"classic_SMOTE",
"kmeans_SMOTE",
"common",
]
model_dict = {}
package_name = "b3clf"
for clf_str, sampling_str in it.product(clf_list, sampling_list):
# joblib_fpath = os.path.join(
# dirname, "pre_trained", "b3clf_{}_{}.joblib".format(clf_str, sampling_str))
# pred_model = joblib.load(joblib_fpath)
joblib_path_str = f"pre_trained/b3clf_{clf_str}_{sampling_str}.joblib"
with pkg_resources.resource_stream(package_name, joblib_path_str) as f:
pred_model = joblib.load(f)
model_dict[clf_str + "_" + sampling_str] = pred_model
return model_dict
@st.cache_resource
def predict_permeability(
clf_str, sampling_str, _models_dict, mol_features, info_df, threshold="none"
):
"""Compute permeability prediction for given feature data."""
# load the model
# pred_model = load_all_models()[clf_str + "_" + sampling_str]
pred_model = _models_dict[clf_str + "_" + sampling_str]
# load the threshold data
package_name = "b3clf"
with pkg_resources.resource_stream(package_name, "data/B3clf_thresholds.xlsx") as f:
df_thres = pd.read_excel(f, index_col=0, engine="openpyxl")
# default threshold is 0.5
label_pool = np.zeros(mol_features.shape[0], dtype=int)
if type(mol_features) == pd.DataFrame:
if mol_features.index.tolist() != info_df.index.tolist():
raise ValueError("Features_df and Info_df do not have the same index.")
# get predicted probabilities
info_df.loc[:, "B3clf_predicted_probability"] = pred_model.predict_proba(
mol_features
)[:, 1]
# get predicted label from probability using the threshold
mask = np.greater_equal(
info_df["B3clf_predicted_probability"].to_numpy(),
# df_thres.loc[clf_str + "-" + sampling_str, threshold])
df_thres.loc["xgb-classic_ADASYN", threshold],
)
label_pool[mask] = 1
# save the predicted labels
info_df["B3clf_predicted_label"] = label_pool
info_df.reset_index(inplace=True)
return info_df
@st.cache_resource
def generate_predictions(
input_fname: str = None,
sep: str = "\s+|\t+",
clf: str = "xgb",
_models_dict: dict = None,
keep_sdf: str = "no",
sampling: str = "classic_ADASYN",
time_per_mol: int = 120,
mol_features: pd.DataFrame = None,
info_df: pd.DataFrame = None,
):
"""
Generate predictions for a given input file.
"""
if mol_features is None and info_df is None:
# mol_tag = os.path.splitext(uploaded_file.name)[0]
# uploaded_file = uploaded_file.read().decode("utf-8")
mol_tag = os.path.basename(input_fname).split(".")[0]
internal_sdf = f"{mol_tag}_optimized_3d.sdf"
# Geometry optimization
# Input:
# * Either an SDF file with molecular geometries or a text file with SMILES strings
geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)
df_features = compute_descriptors(
sdf_file=internal_sdf,
excel_out=None,
output_csv=None,
timeout=None,
time_per_molecule=time_per_mol,
)
# Get computed descriptors
mol_features, info_df = get_descriptors(df=df_features)
# Select descriptors
mol_features = select_descriptors(df=mol_features)
# Scale descriptors
mol_features.iloc[:, :] = scale_descriptors(df=mol_features)
# this is problematic for using the same file for calculation
if os.path.exists(internal_sdf) and keep_sdf == "no":
os.remove(internal_sdf)
# Get classifier
# clf = get_clf(clf_str=clf, sampling_str=sampling)
# Get classifier
result_df = predict_permeability(
clf_str=clf,
sampling_str=sampling,
_models_dict=_models_dict,
mol_features=mol_features,
info_df=info_df,
threshold="none",
)
# Get classifier
display_cols = [
"ID",
"SMILES",
"B3clf_predicted_probability",
"B3clf_predicted_label",
]
result_df = result_df[
[col for col in result_df.columns.to_list() if col in display_cols]
]
return mol_features, info_df, result_df