legend1234 committed on
Commit
95b3113
1 Parent(s): 9c908f4

Refactor the initialization of molecular feature matrix

Browse files
Files changed (1) hide show
  1. app.py +48 -45
app.py CHANGED
@@ -5,18 +5,12 @@ from io import StringIO
5
  import joblib
6
  import numpy as np
7
  import pandas as pd
8
-
9
  # page set up
10
  import streamlit as st
11
  from b3clf.descriptor_padel import compute_descriptors
12
  from b3clf.geometry_opt import geometry_optimize
13
- from b3clf.utils import (
14
- get_descriptors,
15
- predict_permeability,
16
- scale_descriptors,
17
- select_descriptors,
18
- )
19
-
20
  # from PIL import Image
21
  from streamlit_extras.let_it_rain import rain
22
  from streamlit_ketcher import st_ketcher
@@ -54,47 +48,56 @@ resample_methods_dict = {
54
  pandas_display_options = {
55
  "line_limit": 50,
56
  }
 
 
57
 
58
 
59
  # @st.cache_resource
60
  def generate_predictions(
61
- input_fname: str,
62
  sep: str = "\s+|\t+",
63
  clf: str = "xgb",
64
  sampling: str = "classic_ADASYN",
65
  time_per_mol: int = 120,
 
 
66
  ):
67
  """
68
  Generate predictions for a given input file.
69
  """
70
- # mol_tag = os.path.splitext(uploaded_file.name)[0]
71
- # uploaded_file = uploaded_file.read().decode("utf-8")
72
- mol_tag = os.path.basename(input_fname).split(".")[0]
73
- internal_sdf = f"{mol_tag}_optimized_3d.sdf"
74
-
75
- # Geometry optimization
76
- # Input:
77
- # * Either an SDF file with molecular geometries or a text file with SMILES strings
78
-
79
- geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)
80
-
81
- df_features = compute_descriptors(
82
- sdf_file=internal_sdf,
83
- excel_out=None,
84
- output_csv=None,
85
- timeout=None,
86
- time_per_molecule=time_per_mol,
87
- )
88
- # st.write(df_features)
 
 
 
 
89
 
90
- # Get computed descriptors
91
- X_features, info_df = get_descriptors(df=df_features)
92
 
93
- # Select descriptors
94
- X_features = select_descriptors(df=X_features)
95
 
96
- # Scale descriptors
97
- X_features = scale_descriptors(df=X_features)
 
98
 
99
  # Get classifier
100
  # clf = get_clf(clf_str=clf, sampling_str=sampling)
@@ -103,7 +106,7 @@ def generate_predictions(
103
  result_df = predict_permeability(
104
  clf_str=clf,
105
  sampling_str=sampling,
106
- features_df=X_features,
107
  info_df=info_df,
108
  threshold="none",
109
  )
@@ -120,9 +123,7 @@ def generate_predictions(
120
  [col for col in result_df.columns.to_list() if col in display_cols]
121
  ]
122
 
123
- os.remove(internal_sdf)
124
-
125
- return X_features, result_df
126
 
127
 
128
  # Create the Streamlit app
@@ -207,7 +208,7 @@ with upload_column:
207
  # upload file column
208
  with upload_col:
209
  file = st.file_uploader(
210
- label="Upload a CSV, SDF or TXT file",
211
  type=["csv", "sdf", "txt", "smi"],
212
  help="Input molecule file only supports *.csv, *.sdf, *.txt and *.smi.",
213
  accept_multiple_files=False,
@@ -264,24 +265,26 @@ if submit_job_button:
264
  # Save the uploaded file to the temporary file path
265
  with open(temp_file_path, "wb") as temp_file:
266
  temp_file.write(file.read())
267
- # X_features, results = generate_predictions(temp_file_path)
268
- X_features, results = generate_predictions(
269
  input_fname=temp_file_path,
270
  sep="\s+|\t+",
271
  clf=classifiers_dict[classifier],
272
  sampling=resample_methods_dict[resampler],
273
  time_per_mol=120,
 
 
274
  )
275
 
276
  # feature table
277
  with feature_column:
278
  selected_feature_rows = np.min(
279
- [X_features.shape[0], pandas_display_options["line_limit"]]
280
  )
281
- st.dataframe(X_features.iloc[:selected_feature_rows, :], hide_index=False)
282
- # placeholder_features.dataframe(X_features, hide_index=False)
283
  feature_file_name = file.name.split(".")[0] + "_b3clf_features.csv"
284
- features_csv = X_features.to_csv(index=True)
285
  st.download_button(
286
  "Download features as CSV",
287
  data=features_csv,
 
5
  import joblib
6
  import numpy as np
7
  import pandas as pd
 
8
  # page set up
9
  import streamlit as st
10
  from b3clf.descriptor_padel import compute_descriptors
11
  from b3clf.geometry_opt import geometry_optimize
12
+ from b3clf.utils import (get_descriptors, predict_permeability,
13
+ scale_descriptors, select_descriptors)
 
 
 
 
 
14
  # from PIL import Image
15
  from streamlit_extras.let_it_rain import rain
16
  from streamlit_ketcher import st_ketcher
 
48
  pandas_display_options = {
49
  "line_limit": 50,
50
  }
51
+ mol_features = None
52
+ info_df = None
53
 
54
 
55
  # @st.cache_resource
56
  def generate_predictions(
57
+ input_fname: str = None,
58
  sep: str = "\s+|\t+",
59
  clf: str = "xgb",
60
  sampling: str = "classic_ADASYN",
61
  time_per_mol: int = 120,
62
+ mol_features: pd.DataFrame = None,
63
+ info_df: pd.DataFrame = None,
64
  ):
65
  """
66
  Generate predictions for a given input file.
67
  """
68
+ if mol_features is None and info_df is None:
69
+ # mol_tag = os.path.splitext(uploaded_file.name)[0]
70
+ # uploaded_file = uploaded_file.read().decode("utf-8")
71
+ mol_tag = os.path.basename(input_fname).split(".")[0]
72
+ internal_sdf = f"{mol_tag}_optimized_3d.sdf"
73
+
74
+ # Geometry optimization
75
+ # Input:
76
+ # * Either an SDF file with molecular geometries or a text file with SMILES strings
77
+
78
+ geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)
79
+
80
+ df_features = compute_descriptors(
81
+ sdf_file=internal_sdf,
82
+ excel_out=None,
83
+ output_csv=None,
84
+ timeout=None,
85
+ time_per_molecule=time_per_mol,
86
+ )
87
+ # st.write(df_features)
88
+
89
+ # Get computed descriptors
90
+ mol_features, info_df = get_descriptors(df=df_features)
91
 
92
+ # Select descriptors
93
+ mol_features = select_descriptors(df=mol_features)
94
 
95
+ # Scale descriptors
96
+ mol_features.iloc[:, :] = scale_descriptors(df=mol_features)
97
 
98
+ # this is problematic for using the same file for calculation
99
+ if os.path.exists(internal_sdf) and keep_sdf == "no":
100
+ os.remove(internal_sdf)
101
 
102
  # Get classifier
103
  # clf = get_clf(clf_str=clf, sampling_str=sampling)
 
106
  result_df = predict_permeability(
107
  clf_str=clf,
108
  sampling_str=sampling,
109
+ mol_features=mol_features,
110
  info_df=info_df,
111
  threshold="none",
112
  )
 
123
  [col for col in result_df.columns.to_list() if col in display_cols]
124
  ]
125
 
126
+ return mol_features, info_df, result_df
 
 
127
 
128
 
129
  # Create the Streamlit app
 
208
  # upload file column
209
  with upload_col:
210
  file = st.file_uploader(
211
+ label="Upload a CSV, SDF, TXT or SMI file",
212
  type=["csv", "sdf", "txt", "smi"],
213
  help="Input molecule file only supports *.csv, *.sdf, *.txt and *.smi.",
214
  accept_multiple_files=False,
 
265
  # Save the uploaded file to the temporary file path
266
  with open(temp_file_path, "wb") as temp_file:
267
  temp_file.write(file.read())
268
+ # mol_features, results = generate_predictions(temp_file_path)
269
+ mol_features, info_df, results = generate_predictions(
270
  input_fname=temp_file_path,
271
  sep="\s+|\t+",
272
  clf=classifiers_dict[classifier],
273
  sampling=resample_methods_dict[resampler],
274
  time_per_mol=120,
275
+ mol_features=mol_features,
276
+ info_df=info_df,
277
  )
278
 
279
  # feature table
280
  with feature_column:
281
  selected_feature_rows = np.min(
282
+ [mol_features.shape[0], pandas_display_options["line_limit"]]
283
  )
284
+ st.dataframe(mol_features.iloc[:selected_feature_rows, :], hide_index=False)
285
+ # placeholder_features.dataframe(mol_features, hide_index=False)
286
  feature_file_name = file.name.split(".")[0] + "_b3clf_features.csv"
287
+ features_csv = mol_features.to_csv(index=True)
288
  st.download_button(
289
  "Download features as CSV",
290
  data=features_csv,