Spaces:

legend1234
/

b3clf_hf

Sleeping

App Files Files Community

legend1234 committed on Oct 10, 2023

Commit

cf4c3c3

•

1 Parent(s): 9992ded

Attempt to incorporate session state

Browse files

Files changed (1) hide show

app.py +136 -213

app.py CHANGED Viewed

@@ -16,15 +16,19 @@ from b3clf.utils import get_descriptors, scale_descriptors, select_descriptors
 from streamlit_extras.let_it_rain import rain
 from streamlit_ketcher import st_ketcher
 st.set_page_config(
     page_title="BBB Permeability Prediction with Imbalanced Learning",
     # page_icon="🧊",
     layout="wide",
     # initial_sidebar_state="expanded",
     # menu_items={
-    #     'Get Help': 'https://www.extremelycoolapp.com/help',
-    #     'Report a bug': "https://www.extremelycoolapp.com/bug",
-    #     'About': "# This is a header. This is an *extremely* cool app!"
     # }
 )
@@ -53,156 +57,19 @@ mol_features = None
 info_df = None
 results = None
 temp_file_path = None
-@st.cache_data
-def load_all_models():
-    """Get b3clf fitted classifier"""
-    clf_list = ["dtree", "knn", "logreg", "xgb"]
-    sampling_list = [
-        "borderline_SMOTE",
-        "classic_ADASYN",
-        "classic_RandUndersampling",
-        "classic_SMOTE",
-        "kmeans_SMOTE",
-        "common",
-    ]
-    model_dict = {}
-    package_name = "b3clf"
-    for clf_str, sampling_str in it.product(clf_list, sampling_list):
-        # joblib_fpath = os.path.join(
-        #     dirname, "pre_trained", "b3clf_{}_{}.joblib".format(clf_str, sampling_str))
-        # pred_model = joblib.load(joblib_fpath)
-        joblib_path_str = f"pre_trained/b3clf_{clf_str}_{sampling_str}.joblib"
-        with pkg_resources.resource_stream(package_name, joblib_path_str) as f:
-            pred_model = joblib.load(f)
-        model_dict[clf_str + "_" + sampling_str] = pred_model
-    return model_dict
-@st.cache_resource
-def predict_permeability(clf_str, sampling_str, mol_features, info_df, threshold="none"):
-    """Compute permeability prediction for given feature data."""
-    # load the model
-    pred_model = load_all_models()[clf_str + "_" + sampling_str]
-    # load the threshold data
-    package_name = "b3clf"
-    with pkg_resources.resource_stream(
-        package_name, "data/B3clf_thresholds.xlsx"
-    ) as f:
-        df_thres = pd.read_excel(f, index_col=0, engine="openpyxl")
-    # default threshold is 0.5
-    label_pool = np.zeros(mol_features.shape[0], dtype=int)
-    if type(mol_features) == pd.DataFrame:
-        if mol_features.index.tolist() != info_df.index.tolist():
-            raise ValueError(
-                "Features_df and Info_df do not have the same index."
-            )
-    # get predicted probabilities
-    info_df.loc[:, "B3clf_predicted_probability"] = pred_model.predict_proba(mol_features)[
-        :, 1
-    ]
-    # get predicted label from probability using the threshold
-    mask = np.greater_equal(
-        info_df["B3clf_predicted_probability"].to_numpy(),
-        # df_thres.loc[clf_str + "-" + sampling_str, threshold])
-        df_thres.loc["xgb-classic_ADASYN", threshold],
-    )
-    label_pool[mask] = 1
-    # save the predicted labels
-    info_df["B3clf_predicted_label"] = label_pool
-    info_df.reset_index(inplace=True)
-    return info_df
-# @st.cache_resource
-def generate_predictions(
-    input_fname: str = None,
-    sep: str = "\s+|\t+",
-    clf: str = "xgb",
-    sampling: str = "classic_ADASYN",
-    time_per_mol: int = 120,
-    mol_features: pd.DataFrame = None,
-    info_df: pd.DataFrame = None,
-):
-    """
-    Generate predictions for a given input file.
-    """
-    if mol_features is None and info_df is None:
-        # mol_tag = os.path.splitext(uploaded_file.name)[0]
-        # uploaded_file = uploaded_file.read().decode("utf-8")
-        mol_tag = os.path.basename(input_fname).split(".")[0]
-        internal_sdf = f"{mol_tag}_optimized_3d.sdf"
-        # Geometry optimization
-        # Input:
-        # * Either an SDF file with molecular geometries or a text file with SMILES strings
-        geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)
-        df_features = compute_descriptors(
-            sdf_file=internal_sdf,
-            excel_out=None,
-            output_csv=None,
-            timeout=None,
-            time_per_molecule=time_per_mol,
-        )
-        # st.write(df_features)
-        # Get computed descriptors
-        mol_features, info_df = get_descriptors(df=df_features)
-        # Select descriptors
-        mol_features = select_descriptors(df=mol_features)
-        # Scale descriptors
-        mol_features.iloc[:, :] = scale_descriptors(df=mol_features)
-        # this is problematic for using the same file for calculation
-        if os.path.exists(internal_sdf) and keep_sdf == "no":
-            os.remove(internal_sdf)
-    # Get classifier
-    # clf = get_clf(clf_str=clf, sampling_str=sampling)
-    # Get classifier
-    result_df = predict_permeability(
-        clf_str=clf,
-        sampling_str=sampling,
-        mol_features=mol_features,
-        info_df=info_df,
-        threshold="none",
-    )
-    # Get classifier
-    display_cols = [
-        "ID",
-        "SMILES",
-        "B3clf_predicted_probability",
-        "B3clf_predicted_label",
-    ]
-    result_df = result_df[
-        [col for col in result_df.columns.to_list() if col in display_cols]
-    ]
-    return mol_features, info_df, result_df
 # Create the Streamlit app
 st.title(":blue[BBB Permeability Prediction with Imbalanced Learning]")
 info_column, upload_column = st.columns(2)
 # download sample files
 with info_column:
     st.subheader("About `B3clf`")
@@ -212,10 +79,10 @@ with info_column:
         `B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. It supports decision tree, XGBoost, kNN, logistical regression and 5 resampling strategies (SMOTE, Borderline SMOTE, k-means SMOTE and ADASYN). The workflow of `B3clf` is summarized as below. The Source code and more details are available at https://github.com/theochem/B3clf. This project is supported by Digital Research Alliance of Canada (originally known as Compute Canada) and NSERC. This project is maintained by QC-Dev comminity. For further information and inquiries please contact us at qcdevs@gmail.com."""
     )
     st.text(" \n")
-    # text_body = '''
     # `B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. It supports decision tree, XGBoost, kNN, logistical regression and 5 resampling strategies (SMOTE, Borderline SMOTE, k-means SMOTE and ADASYN). The workflow of `B3clf` is summarized as below. The Source code and more details are available at https://github.com/theochem/B3clf.
-    # '''
-    # st.markdown(f'<p align="justify">{text_body}</p>',
     #             unsafe_allow_html=True)
     # image = Image.open("images/b3clf_workflow.png")
@@ -224,7 +91,7 @@ with info_column:
     # image_path = "images/b3clf_workflow.png"
     # image_width_percent = 80
     # info_column.markdown(
-    #     f'<img src="{image_path}" style="max-width: {image_width_percent}%; height: auto;">',
     #     unsafe_allow_html=True
     #     )
@@ -280,12 +147,42 @@ with upload_column:
         upload_col, _, submit_job_col, _ = st.columns((4, 0.05, 1, 0.05))
         # upload file column
         with upload_col:
-            file = st.file_uploader(
                 label="Upload a CSV, SDF, TXT or SMI file",
                 type=["csv", "sdf", "txt", "smi"],
                 help="Input molecule file only supports *.csv, *.sdf, *.txt and *.smi.",
                 accept_multiple_files=False,
             )
         # submit job column
         with submit_job_col:
             st.text(" \n")
@@ -295,9 +192,9 @@ with upload_column:
                 unsafe_allow_html=True,
             )
             submit_job_button = st.button(
-                label="Submit Job", key="submit_job_button", type="secondary"
             )
-        # submit_job_col.markdown("<div style='display: flex; justify-content: center;'>",
         #                         unsafe_allow_html=True)
         # submit_job_button = submit_job_col.button(
         #     label="Submit job", key="submit_job_button", type="secondary"
@@ -329,69 +226,95 @@ with prediction_column:
     # placeholder_predictions.text("prediction")
 # Generate predictions when the user uploads a file
-if submit_job_button:
-    if file and mol_features is None and info_df is None:
         temp_dir = tempfile.mkdtemp()
         # Create a temporary file path for the uploaded file
-        temp_file_path = os.path.join(temp_dir, file.name)
         # Save the uploaded file to the temporary file path
         with open(temp_file_path, "wb") as temp_file:
-            temp_file.write(file.read())
-        # mol_features, results = generate_predictions(temp_file_path)
-    mol_features, info_df, results = generate_predictions(
-        input_fname=temp_file_path,
-        sep="\s+|\t+",
-        clf=classifiers_dict[classifier],
-        sampling=resample_methods_dict[resampler],
-        time_per_mol=120,
-        mol_features=mol_features,
-        info_df=info_df,
-    )
-    st.balloons()
-# feature table
-with feature_column:
-    if mol_features is not None:
-        selected_feature_rows = np.min(
-            [mol_features.shape[0], pandas_display_options["line_limit"]]
         )
-        st.dataframe(mol_features.iloc[:selected_feature_rows, :], hide_index=False)
-        # placeholder_features.dataframe(mol_features, hide_index=False)
-        feature_file_name = file.name.split(".")[0] + "_b3clf_features.csv"
-        features_csv = mol_features.to_csv(index=True)
-        st.download_button(
-            "Download features as CSV",
-            data=features_csv,
-            file_name=feature_file_name,
         )
-# prediction table
-with prediction_column:
-    # st.subheader("Predictions")
-    if results is not None:
-        # Display the predictions in a table
-        selected_result_rows = np.min(
-            [results.shape[0], pandas_display_options["line_limit"]]
-        )
-        results_df_display = results.iloc[
-            :selected_result_rows, :
-        ].style.format({"B3clf_predicted_probability": "{:.6f}".format})
-        st.dataframe(results_df_display, hide_index=True)
-        # Add a button to download the predictions as a CSV file
-        predictions_csv = results.to_csv(index=True)
-        results_file_name = file.name.split(".")[0] + "_b3clf_predictions.csv"
-        st.download_button(
-            "Download predictions as CSV",
-            data=predictions_csv,
-            file_name=results_file_name,
-        )
-        # indicate the success of the job
-        # rain(
-        #     emoji="🎈",
-        #     font_size=54,
-        #     falling_speed=5,
-        #     animation_length=10,
-        # )
 # hide footer
@@ -412,9 +335,9 @@ st.markdown(
     <script>
       window.dataLayer = window.dataLayer || [];
       function gtag(){dataLayer.push(arguments);}
-      gtag('js', new Date());
-      gtag('config', 'G-WG8QYRELP9');
     </script>
     """,
     unsafe_allow_html=True,

 from streamlit_extras.let_it_rain import rain
 from streamlit_ketcher import st_ketcher
+from utils import generate_predictions, load_all_models
+st.cache_data.clear()
 st.set_page_config(
     page_title="BBB Permeability Prediction with Imbalanced Learning",
     # page_icon="🧊",
     layout="wide",
     # initial_sidebar_state="expanded",
     # menu_items={
+    #     "Get Help": "https://www.extremelycoolapp.com/help",
+    #     "Report a bug": "https://www.extremelycoolapp.com/bug",
+    #     "About": "# This is a header. This is an *extremely* cool app!"
     # }
 )
 info_df = None
 results = None
 temp_file_path = None
+all_models = load_all_models()
 # Create the Streamlit app
 st.title(":blue[BBB Permeability Prediction with Imbalanced Learning]")
 info_column, upload_column = st.columns(2)
+# initialize the molecule features and info dataframe session state
+if "mol_features" not in st.session_state:
+    st.session_state.mol_features = None
+if "info_df" not in st.session_state:
+    st.session_state.info_df = None
 # download sample files
 with info_column:
     st.subheader("About `B3clf`")
         `B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. It supports decision tree, XGBoost, kNN, logistical regression and 5 resampling strategies (SMOTE, Borderline SMOTE, k-means SMOTE and ADASYN). The workflow of `B3clf` is summarized as below. The Source code and more details are available at https://github.com/theochem/B3clf. This project is supported by Digital Research Alliance of Canada (originally known as Compute Canada) and NSERC. This project is maintained by QC-Dev comminity. For further information and inquiries please contact us at qcdevs@gmail.com."""
     )
     st.text(" \n")
+    # text_body = """
     # `B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. It supports decision tree, XGBoost, kNN, logistical regression and 5 resampling strategies (SMOTE, Borderline SMOTE, k-means SMOTE and ADASYN). The workflow of `B3clf` is summarized as below. The Source code and more details are available at https://github.com/theochem/B3clf.
+    # """
+    # st.markdown(f"<p align="justify">{text_body}</p>",
     #             unsafe_allow_html=True)
     # image = Image.open("images/b3clf_workflow.png")
     # image_path = "images/b3clf_workflow.png"
     # image_width_percent = 80
     # info_column.markdown(
+    #     f"<img src="{image_path}" style="max-width: {image_width_percent}%; height: auto;">",
     #     unsafe_allow_html=True
     #     )
         upload_col, _, submit_job_col, _ = st.columns((4, 0.05, 1, 0.05))
         # upload file column
         with upload_col:
+            # session state tracking of the file uploader
+            if "uploaded_file" not in st.session_state:
+                st.session_state.uploaded_file = None
+            if "uploaded_file_changed" not in st.session_state:
+                st.session_state.uploaded_file_changed = False
+            # def update_uploader_session_info():
+            #     """Update the session state of the file uploader."""
+            #     st.session_state.uploaded_file = uploaded_file
+            uploaded_file = st.file_uploader(
                 label="Upload a CSV, SDF, TXT or SMI file",
                 type=["csv", "sdf", "txt", "smi"],
                 help="Input molecule file only supports *.csv, *.sdf, *.txt and *.smi.",
                 accept_multiple_files=False,
+                # key="uploaded_file",
+                # on_change=update_uploader_session_info,
             )
+            if uploaded_file:
+                # st.write(f"the uploaded file: {uploaded_file}")
+                # when the newly uploaded file is different from the previous one
+                if st.session_state.uploaded_file != uploaded_file:
+                    st.session_state.uploaded_file_changed = True
+                else:
+                    st.session_state.uploaded_file_changed = False
+                st.session_state.uploaded_file = uploaded_file
+                # when new file is the same as the previous one
+                # else:
+                #     st.session_state.uploaded_file_changed = False
+                # st.session_state.uploaded_file = uploaded_file
+            # set session state for the file uploader
+            # st.write(f"the state of uploaded file: {st.session_state.uploaded_file}")
+            # st.write(f"the state of uploaded file changed: {st.session_state.uploaded_file_changed}")
         # submit job column
         with submit_job_col:
             st.text(" \n")
                 unsafe_allow_html=True,
             )
             submit_job_button = st.button(
+                label="Submit Job", type="secondary", key="job_button"
             )
+        # submit_job_col.markdown("<div style="display: flex; justify-content: center;">",
         #                         unsafe_allow_html=True)
         # submit_job_button = submit_job_col.button(
         #     label="Submit job", key="submit_job_button", type="secondary"
     # placeholder_predictions.text("prediction")
+st.write(
+    f"the state of uploaded file changed before checking: {st.session_state.uploaded_file_changed}"
+)
 # Generate predictions when the user uploads a file
+# if submit_job_button:
+print(st.session_state)
+if "job_button" in st.session_state:
+    # when new file is uploaded
+    # update_uploader_session_info()
+    st.write(
+        f"the state of uploaded file changed after checking: {st.session_state.uploaded_file_changed}"
+    )
+    if st.session_state.uploaded_file_changed:
         temp_dir = tempfile.mkdtemp()
         # Create a temporary file path for the uploaded file
+        temp_file_path = os.path.join(temp_dir, uploaded_file.name)
         # Save the uploaded file to the temporary file path
         with open(temp_file_path, "wb") as temp_file:
+            temp_file.write(uploaded_file.read())
+        mol_features, info_df, results = generate_predictions(
+            input_fname=temp_file_path,
+            sep="\s+|\t+",
+            clf=classifiers_dict[classifier],
+            _models_dict=all_models,
+            sampling=resample_methods_dict[resampler],
+            time_per_mol=120,
+            mol_features=None,
+            info_df=None,
         )
+        st.session_state.mol_features = mol_features
+        st.session_state.info_df = info_df
+    else:
+        mol_features, info_df, results = generate_predictions(
+            input_fname=None,
+            sep="\s+|\t+",
+            clf=classifiers_dict[classifier],
+            _models_dict=all_models,
+            sampling=resample_methods_dict[resampler],
+            time_per_mol=120,
+            mol_features=st.session_state.mol_features,
+            info_df=st.session_state.info_df,
         )
+    # feature table
+    with feature_column:
+        if mol_features is not None:
+            selected_feature_rows = np.min(
+                [mol_features.shape[0], pandas_display_options["line_limit"]]
+            )
+            st.dataframe(mol_features.iloc[:selected_feature_rows, :], hide_index=False)
+            # placeholder_features.dataframe(mol_features, hide_index=False)
+            feature_file_name = uploaded_file.name.split(".")[0] + "_b3clf_features.csv"
+            features_csv = mol_features.to_csv(index=True)
+            st.download_button(
+                "Download features as CSV",
+                data=features_csv,
+                file_name=feature_file_name,
+            )
+    # prediction table
+    with prediction_column:
+        # st.subheader("Predictions")
+        if results is not None:
+            # Display the predictions in a table
+            selected_result_rows = np.min(
+                [results.shape[0], pandas_display_options["line_limit"]]
+            )
+            results_df_display = results.iloc[:selected_result_rows, :].style.format(
+                {"B3clf_predicted_probability": "{:.6f}".format}
+            )
+            st.dataframe(results_df_display, hide_index=True)
+            # Add a button to download the predictions as a CSV file
+            predictions_csv = results.to_csv(index=True)
+            results_file_name = (
+                uploaded_file.name.split(".")[0] + "_b3clf_predictions.csv"
+            )
+            st.download_button(
+                "Download predictions as CSV",
+                data=predictions_csv,
+                file_name=results_file_name,
+            )
+            # indicate the success of the job
+            # rain(
+            #     emoji="🎈",
+            #     font_size=54,
+            #     falling_speed=5,
+            #     animation_length=10,
+            # )
+    st.balloons()
 # hide footer
     <script>
       window.dataLayer = window.dataLayer || [];
       function gtag(){dataLayer.push(arguments);}
+      gtag("js", new Date());
+      gtag("config", "G-WG8QYRELP9");
     </script>
     """,
     unsafe_allow_html=True,