mgyigit committed
Commit a7f8182 · verified · 1 Parent(s): 37a12fb

Update src/bin/binding_affinity_estimator.py

Files changed (1)
  1. src/bin/binding_affinity_estimator.py +74 -74
src/bin/binding_affinity_estimator.py CHANGED
@@ -12,8 +12,7 @@ sys.path.append('.')
 
 from sklearn import linear_model
 from sklearn.model_selection import KFold
-from sklearn.metrics import mean_squared_error,mean_absolute_error
-from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_squared_error, mean_absolute_error
 from sklearn.preprocessing import MinMaxScaler
 
 skempi_vectors_path = None
@@ -22,35 +21,35 @@ representation_name = None
 def load_representation(multi_col_representation_vector_file_path):
     print("\nLoading representation vectors...\n")
     multi_col_representation_vector = pd.read_csv(multi_col_representation_vector_file_path)
-    vals = multi_col_representation_vector.iloc[:,1:(len(multi_col_representation_vector.columns))]
-    original_values_as_df = pd.DataFrame({'PDB_ID': pd.Series([], dtype='str'),'Vector': pd.Series([], dtype='object')})
-    for index, row in tqdm.tqdm(vals.iterrows(), total = len(vals)):
+    vals = multi_col_representation_vector.iloc[:, 1:(len(multi_col_representation_vector.columns))]
+    original_values_as_df = pd.DataFrame({'PDB_ID': pd.Series([], dtype='str'), 'Vector': pd.Series([], dtype='object')})
+    for index, row in tqdm.tqdm(vals.iterrows(), total=len(vals)):
         list_of_floats = [float(item) for item in list(row)]
         original_values_as_df.loc[index] = [multi_col_representation_vector.iloc[index]['PDB_ID']] + [list_of_floats]
     return original_values_as_df
 
 def calc_train_error(X_train, y_train, model):
-    '''returns in-sample error for already fit model.'''
+    '''Returns in-sample error for an already fit model.'''
     predictions = model.predict(X_train)
     mse = mean_squared_error(y_train, predictions)
     mae = mean_absolute_error(y_train, predictions)
     corr = scipy.stats.pearsonr(y_train, predictions)
-    return mse,mae,corr
-
+    return mse, mae, corr
+
 def calc_validation_error(X_test, y_test, model):
-    '''returns out-of-sample error for already fit model.'''
+    '''Returns out-of-sample error for an already fit model.'''
    predictions = model.predict(X_test)
     mse = mean_squared_error(y_test, predictions)
     mae = mean_absolute_error(y_test, predictions)
     corr = scipy.stats.pearsonr(y_test, predictions)
-    return mse,mae,corr
-
+    return mse, mae, corr
+
 def calc_metrics(X_train, y_train, X_test, y_test, model):
-    '''fits model and returns the metrics for in-sample error and out-of-sample error'''
+    '''Fits the model and returns the metrics for in-sample and out-of-sample errors.'''
     model.fit(X_train, y_train)
-    train_mse_error,train_mae_error,train_corr = calc_train_error(X_train, y_train, model)
-    val_mse_error,val_mae_error,val_corr = calc_validation_error(X_test, y_test, model)
-    return train_mse_error, val_mse_error, train_mae_error, val_mae_error,train_corr,val_corr
+    train_mse_error, train_mae_error, train_corr = calc_train_error(X_train, y_train, model)
+    val_mse_error, val_mae_error, val_corr = calc_validation_error(X_test, y_test, model)
+    return train_mse_error, val_mse_error, train_mae_error, val_mae_error, train_corr, val_corr
 
 def report_results(
     train_mse_error_list,
@@ -62,41 +61,35 @@ def report_results(
     train_corr_pval_list,
     validation_corr_pval_list,
 ):
-    result_df = pd.DataFrame(
-        {
-            "train_mse_error": round(np.mean(train_mse_error_list) * 100, 4),
-            "train_mse_std": round(np.std(train_mse_error_list) * 100, 4),
-            "val_mse_error": round(np.mean(validation_mse_error_list) * 100, 4),
-            "val_mse_std": round(np.std(validation_mse_error_list) * 100, 4),
-            "train_mae_error": round(np.mean(train_mae_error_list) * 100, 4),
-            "train_mae_std": round(np.std(train_mae_error_list) * 100, 4),
-            "val_mae_error": round(np.mean(validation_mae_error_list) * 100, 4),
-            "val_mae_std": round(np.std(validation_mae_error_list) * 100, 4),
-            "train_corr": round(np.mean(train_corr_list), 4),
-            "train_corr_pval": round(np.mean(train_corr_pval_list), 4),
-            "validation_corr": round(np.mean(validation_corr_list), 4),
-            "validation_corr_pval": round(np.mean(validation_corr_pval_list), 4),
-        },
-        index=[0],
-    )
-
-    result_detail_df = pd.DataFrame(
-        {
-            "train_mse_errors": list(np.multiply(train_mse_error_list, 100)),
-            "val_mse_errors": list(np.multiply(validation_mse_error_list, 100)),
-            "train_mae_errors": list(np.multiply(train_mae_error_list, 100)),
-            "val_mae_errors": list(np.multiply(validation_mae_error_list, 100)),
-            "train_corrs": list(np.multiply(train_corr_list, 100)),
-            "train_corr_pvals": list(np.multiply(train_corr_pval_list, 100)),
-            "validation_corr": list(np.multiply(validation_corr_list, 100)),
-            "validation_corr_pval": list(np.multiply(validation_corr_pval_list, 100)),
-        },
-        index=range(len(train_mse_error_list)),
-    )
-    return result_df, result_detail_df
-
+    result_summary = {
+        "train_mse_error": round(np.mean(train_mse_error_list) * 100, 4),
+        "train_mse_std": round(np.std(train_mse_error_list) * 100, 4),
+        "val_mse_error": round(np.mean(validation_mse_error_list) * 100, 4),
+        "val_mse_std": round(np.std(validation_mse_error_list) * 100, 4),
+        "train_mae_error": round(np.mean(train_mae_error_list) * 100, 4),
+        "train_mae_std": round(np.std(train_mae_error_list) * 100, 4),
+        "val_mae_error": round(np.mean(validation_mae_error_list) * 100, 4),
+        "val_mae_std": round(np.std(validation_mae_error_list) * 100, 4),
+        "train_corr": round(np.mean(train_corr_list), 4),
+        "train_corr_pval": round(np.mean(train_corr_pval_list), 4),
+        "validation_corr": round(np.mean(validation_corr_list), 4),
+        "validation_corr_pval": round(np.mean(validation_corr_pval_list), 4),
+    }
+
+    result_detail = {
+        "train_mse_errors": list(np.multiply(train_mse_error_list, 100)),
+        "val_mse_errors": list(np.multiply(validation_mse_error_list, 100)),
+        "train_mae_errors": list(np.multiply(train_mae_error_list, 100)),
+        "val_mae_errors": list(np.multiply(validation_mae_error_list, 100)),
+        "train_corrs": list(np.multiply(train_corr_list, 100)),
+        "train_corr_pvals": list(np.multiply(train_corr_pval_list, 100)),
+        "validation_corrs": list(np.multiply(validation_corr_list, 100)),
+        "validation_corr_pvals": list(np.multiply(validation_corr_pval_list, 100)),
+    }
+
+    return result_summary, result_detail
 
-def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
+def predictAffinityWithModel(regressor_model, multiplied_vectors_df):
     K = 10
     kf = KFold(n_splits=K, shuffle=True, random_state=42)
 
@@ -110,13 +103,15 @@ def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
     validation_corr_pval_list = []
 
     data = np.array(np.asarray(multiplied_vectors_df["Vector"].tolist()), dtype=float)
-    ppi_affinity_filtered_df = ppi_affinity_df\
-    [ppi_affinity_df['Protein1'].isin(multiplied_vectors_df['Protein1']) &\
-    ppi_affinity_df['Protein2'].isin(multiplied_vectors_df['Protein2']) ]
+    ppi_affinity_filtered_df = ppi_affinity_df[
+        ppi_affinity_df['Protein1'].isin(multiplied_vectors_df['Protein1']) &
+        ppi_affinity_df['Protein2'].isin(multiplied_vectors_df['Protein2'])
+    ]
     target = np.array(ppi_affinity_filtered_df["Affinity"])
     scaler = MinMaxScaler()
     scaler.fit(target.reshape(-1, 1))
     target = scaler.transform(target.reshape(-1, 1))[:, 0]
+
     for train_index, val_index in tqdm.tqdm(kf.split(data, target), total=K):
 
         # split data
@@ -124,9 +119,9 @@ def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
         y_train, y_val = target[train_index], target[val_index]
 
         # instantiate model
-        reg = regressor_model #linear_model.BayesianRidge()
+        reg = regressor_model
 
-        # calculate error_list
+        # calculate errors
         (
             train_mse_error,
             val_mse_error,
@@ -136,7 +131,7 @@ def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
            val_corr,
         ) = calc_metrics(X_train, y_train, X_val, y_val, reg)
 
-        # append to appropriate list
+        # append to appropriate lists
         train_mse_error_list.append(train_mse_error)
         validation_mse_error_list.append(val_mse_error)
 
@@ -162,35 +157,40 @@ def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
 
 ppi_affinity_file_path = "../data/auxilary_input/skempi_pipr/SKEMPI_all_dg_avg.txt"
 ppi_affinity_file = os.path.join(script_dir, ppi_affinity_file_path)
-ppi_affinity_df = pd.read_csv(ppi_affinity_file,sep="\t",header=None)
+ppi_affinity_df = pd.read_csv(ppi_affinity_file, sep="\t", header=None)
 ppi_affinity_df.columns = ['Protein1', 'Protein2', 'Affinity']
 
-#Calculate vector element-wise multiplication as described in https://academic.oup.com/bioinformatics/article/35/14/i305/5529260
-
 def calculate_vector_multiplications(skempi_vectors_df):
-    multiplied_vectors = pd.DataFrame({'Protein1': pd.Series([], dtype='str'),\
-                                       'Protein2': pd.Series([], dtype='str'),\
-                                       'Vector': pd.Series([], dtype='object')})
+    multiplied_vectors = pd.DataFrame({
+        'Protein1': pd.Series([], dtype='str'),
+        'Protein2': pd.Series([], dtype='str'),
+        'Vector': pd.Series([], dtype='object')
+    })
     print("Element-wise vector multiplications are being calculated")
     rep_prot_list = list(skempi_vectors_df['PDB_ID'])
-    for index,row in tqdm.tqdm(ppi_affinity_df.iterrows()):
+
+    for index, row in tqdm.tqdm(ppi_affinity_df.iterrows()):
         if row['Protein1'] in rep_prot_list and row['Protein2'] in rep_prot_list:
-            vec1 = list(skempi_vectors_df[skempi_vectors_df['PDB_ID']\
-                == row['Protein1']]['Vector'])[0]
-            vec2 = list(skempi_vectors_df[skempi_vectors_df['PDB_ID']\
-                == row['Protein2']]['Vector'])[0]
-            multiplied_vec = np.multiply(vec1,vec2)
-
-            multiplied_vectors = multiplied_vectors.\
-                append({'Protein1':row['Protein1'], 'Protein2':row['Protein2'],\
-                'Vector':multiplied_vec},ignore_index = True)
+            vec1 = list(skempi_vectors_df[skempi_vectors_df['PDB_ID'] == row['Protein1']]['Vector'])[0]
+            vec2 = list(skempi_vectors_df[skempi_vectors_df['PDB_ID'] == row['Protein2']]['Vector'])[0]
+            multiplied_vec = np.multiply(vec1, vec2)
+
+            multiplied_vectors = multiplied_vectors.append({
+                'Protein1': row['Protein1'],
+                'Protein2': row['Protein2'],
+                'Vector': multiplied_vec
+            }, ignore_index=True)
+
     return multiplied_vectors
 
 def predict_affinities_and_report_results():
     skempi_vectors_df = load_representation(skempi_vectors_path)
     multiplied_vectors_df = calculate_vector_multiplications(skempi_vectors_df)
     model = linear_model.BayesianRidge()
-    result_df, result_detail_df = predictAffinityWithModel(model,multiplied_vectors_df)
-    result_df.to_csv(os.path.join(script_dir, r"../results/Affinity_prediction_skempiv1_{0}.csv".format(representation_name)),index=False)
-    result_detail_df.to_csv(os.path.join(script_dir, r"../results/Affinity_prediction_skempiv1_{0}_detail.csv".format(representation_name)),index=False)
+    result_summary, result_detail = predictAffinityWithModel(model, multiplied_vectors_df)
 
+    # Return the results as a dictionary instead of writing to a file
+    return {
+        "summary": result_summary,
+        "detail": result_detail
+    }
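With this commit, predict_affinities_and_report_results() no longer writes the two result CSVs itself; it returns the summary statistics and the per-fold metrics as a dictionary with "summary" and "detail" keys. A minimal sketch of how a caller could still persist them in the previous CSV layout, assuming the module is importable as binding_affinity_estimator and that the caller sets the module-level skempi_vectors_path and representation_name as before (the paths and names below are illustrative, not part of the commit):

import pandas as pd

import binding_affinity_estimator as bae  # assumes src/bin is on sys.path

# Module-level inputs are still configured by the caller, as elsewhere in the repository.
bae.skempi_vectors_path = "skempi_vectors.csv"    # hypothetical representation file
bae.representation_name = "my_representation"     # hypothetical representation name

results = bae.predict_affinities_and_report_results()

# "summary" is a dict of scalar means/stds, "detail" a dict of per-fold lists,
# so each converts directly to a DataFrame if CSV output is still wanted.
pd.DataFrame(results["summary"], index=[0]).to_csv(
    "Affinity_prediction_skempiv1_{0}.csv".format(bae.representation_name), index=False)
pd.DataFrame(results["detail"]).to_csv(
    "Affinity_prediction_skempiv1_{0}_detail.csv".format(bae.representation_name), index=False)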
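A portability note on code the commit keeps rather than changes: calculate_vector_multiplications still grows its result with DataFrame.append, which pandas deprecated in 1.4 and removed in 2.0. On a newer pandas, an equivalent construction collects the rows in a plain list and builds the frame once. A sketch under that assumption; the explicit ppi_affinity_df parameter is only there to keep the example self-contained, whereas the committed function reads it from a module-level global:

import numpy as np
import pandas as pd
import tqdm


def calculate_vector_multiplications(skempi_vectors_df, ppi_affinity_df):
    '''Element-wise product of the two proteins' representation vectors, per SKEMPI pair.'''
    # Lookup table PDB_ID -> representation vector, instead of re-filtering the frame per row.
    vector_by_id = dict(zip(skempi_vectors_df['PDB_ID'], skempi_vectors_df['Vector']))

    rows = []
    for _, row in tqdm.tqdm(ppi_affinity_df.iterrows(), total=len(ppi_affinity_df)):
        if row['Protein1'] in vector_by_id and row['Protein2'] in vector_by_id:
            rows.append({
                'Protein1': row['Protein1'],
                'Protein2': row['Protein2'],
                'Vector': np.multiply(vector_by_id[row['Protein1']], vector_by_id[row['Protein2']]),
            })

    # One construction call replaces the repeated DataFrame.append calls.
    return pd.DataFrame(rows, columns=['Protein1', 'Protein2', 'Vector'])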