Spaces:

legend1234
/

b3clf_hf

Sleeping

App Files Files Community

legend1234 committed on Oct 9, 2023

Commit

95b3113

•

1 Parent(s): 9c908f4

Refactor the initialization of molecular feature matrix

Browse files

Files changed (1) hide show

app.py +48 -45

app.py CHANGED Viewed

@@ -5,18 +5,12 @@ from io import StringIO
 import joblib
 import numpy as np
 import pandas as pd
 # page set up
 import streamlit as st
 from b3clf.descriptor_padel import compute_descriptors
 from b3clf.geometry_opt import geometry_optimize
-from b3clf.utils import (
-    get_descriptors,
-    predict_permeability,
-    scale_descriptors,
-    select_descriptors,
-)
 # from PIL import Image
 from streamlit_extras.let_it_rain import rain
 from streamlit_ketcher import st_ketcher
@@ -54,47 +48,56 @@ resample_methods_dict = {
 pandas_display_options = {
     "line_limit": 50,
 }
 # @st.cache_resource
 def generate_predictions(
-    input_fname: str,
     sep: str = "\s+|\t+",
     clf: str = "xgb",
     sampling: str = "classic_ADASYN",
     time_per_mol: int = 120,
 ):
     """
     Generate predictions for a given input file.
     """
-    # mol_tag = os.path.splitext(uploaded_file.name)[0]
-    # uploaded_file = uploaded_file.read().decode("utf-8")
-    mol_tag = os.path.basename(input_fname).split(".")[0]
-    internal_sdf = f"{mol_tag}_optimized_3d.sdf"
-    # Geometry optimization
-    # Input:
-    # * Either an SDF file with molecular geometries or a text file with SMILES strings
-    geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)
-    df_features = compute_descriptors(
-        sdf_file=internal_sdf,
-        excel_out=None,
-        output_csv=None,
-        timeout=None,
-        time_per_molecule=time_per_mol,
-    )
-    # st.write(df_features)
-    # Get computed descriptors
-    X_features, info_df = get_descriptors(df=df_features)
-    # Select descriptors
-    X_features = select_descriptors(df=X_features)
-    # Scale descriptors
-    X_features = scale_descriptors(df=X_features)
     # Get classifier
     # clf = get_clf(clf_str=clf, sampling_str=sampling)
@@ -103,7 +106,7 @@ def generate_predictions(
     result_df = predict_permeability(
         clf_str=clf,
         sampling_str=sampling,
-        features_df=X_features,
         info_df=info_df,
         threshold="none",
     )
@@ -120,9 +123,7 @@ def generate_predictions(
         [col for col in result_df.columns.to_list() if col in display_cols]
     ]
-    os.remove(internal_sdf)
-    return X_features, result_df
 # Create the Streamlit app
@@ -207,7 +208,7 @@ with upload_column:
         # upload file column
         with upload_col:
             file = st.file_uploader(
-                label="Upload a CSV, SDF or TXT file",
                 type=["csv", "sdf", "txt", "smi"],
                 help="Input molecule file only supports *.csv, *.sdf, *.txt and *.smi.",
                 accept_multiple_files=False,
@@ -264,24 +265,26 @@ if submit_job_button:
         # Save the uploaded file to the temporary file path
         with open(temp_file_path, "wb") as temp_file:
             temp_file.write(file.read())
-        # X_features, results = generate_predictions(temp_file_path)
-        X_features, results = generate_predictions(
             input_fname=temp_file_path,
             sep="\s+|\t+",
             clf=classifiers_dict[classifier],
             sampling=resample_methods_dict[resampler],
             time_per_mol=120,
         )
         # feature table
         with feature_column:
             selected_feature_rows = np.min(
-                [X_features.shape[0], pandas_display_options["line_limit"]]
             )
-            st.dataframe(X_features.iloc[:selected_feature_rows, :], hide_index=False)
-            # placeholder_features.dataframe(X_features, hide_index=False)
             feature_file_name = file.name.split(".")[0] + "_b3clf_features.csv"
-            features_csv = X_features.to_csv(index=True)
             st.download_button(
                 "Download features as CSV",
                 data=features_csv,

 import joblib
 import numpy as np
 import pandas as pd
 # page set up
 import streamlit as st
 from b3clf.descriptor_padel import compute_descriptors
 from b3clf.geometry_opt import geometry_optimize
+from b3clf.utils import (get_descriptors, predict_permeability,
+                         scale_descriptors, select_descriptors)
 # from PIL import Image
 from streamlit_extras.let_it_rain import rain
 from streamlit_ketcher import st_ketcher
 pandas_display_options = {
     "line_limit": 50,
 }
+mol_features = None
+info_df = None
 # @st.cache_resource
 def generate_predictions(
+    input_fname: str = None,
     sep: str = "\s+|\t+",
     clf: str = "xgb",
     sampling: str = "classic_ADASYN",
     time_per_mol: int = 120,
+    mol_features: pd.DataFrame = None,
+    info_df: pd.DataFrame = None,
 ):
     """
     Generate predictions for a given input file.
     """
+    if mol_features is None and info_df is None:
+        # mol_tag = os.path.splitext(uploaded_file.name)[0]
+        # uploaded_file = uploaded_file.read().decode("utf-8")
+        mol_tag = os.path.basename(input_fname).split(".")[0]
+        internal_sdf = f"{mol_tag}_optimized_3d.sdf"
+        # Geometry optimization
+        # Input:
+        # * Either an SDF file with molecular geometries or a text file with SMILES strings
+        geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)
+        df_features = compute_descriptors(
+            sdf_file=internal_sdf,
+            excel_out=None,
+            output_csv=None,
+            timeout=None,
+            time_per_molecule=time_per_mol,
+        )
+        # st.write(df_features)
+        # Get computed descriptors
+        mol_features, info_df = get_descriptors(df=df_features)
+        # Select descriptors
+        mol_features = select_descriptors(df=mol_features)
+        # Scale descriptors
+        mol_features.iloc[:, :] = scale_descriptors(df=mol_features)
+        # this is problematic for using the same file for calculation
+        if os.path.exists(internal_sdf) and keep_sdf == "no":
+            os.remove(internal_sdf)
     # Get classifier
     # clf = get_clf(clf_str=clf, sampling_str=sampling)
     result_df = predict_permeability(
         clf_str=clf,
         sampling_str=sampling,
+        mol_features=mol_features,
         info_df=info_df,
         threshold="none",
     )
         [col for col in result_df.columns.to_list() if col in display_cols]
     ]
+    return mol_features, info_df, result_df
 # Create the Streamlit app
         # upload file column
         with upload_col:
             file = st.file_uploader(
+                label="Upload a CSV, SDF, TXT or SMI file",
                 type=["csv", "sdf", "txt", "smi"],
                 help="Input molecule file only supports *.csv, *.sdf, *.txt and *.smi.",
                 accept_multiple_files=False,
         # Save the uploaded file to the temporary file path
         with open(temp_file_path, "wb") as temp_file:
             temp_file.write(file.read())
+        # mol_features, results = generate_predictions(temp_file_path)
+        mol_features, info_df, results = generate_predictions(
             input_fname=temp_file_path,
             sep="\s+|\t+",
             clf=classifiers_dict[classifier],
             sampling=resample_methods_dict[resampler],
             time_per_mol=120,
+            mol_features=mol_features,
+            info_df=info_df,
         )
         # feature table
         with feature_column:
             selected_feature_rows = np.min(
+                [mol_features.shape[0], pandas_display_options["line_limit"]]
             )
+            st.dataframe(mol_features.iloc[:selected_feature_rows, :], hide_index=False)
+            # placeholder_features.dataframe(mol_features, hide_index=False)
             feature_file_name = file.name.split(".")[0] + "_b3clf_features.csv"
+            features_csv = mol_features.to_csv(index=True)
             st.download_button(
                 "Download features as CSV",
                 data=features_csv,