Spaces:
Sleeping
Sleeping
Update src/bin/binding_affinity_estimator.py
Browse files
src/bin/binding_affinity_estimator.py
CHANGED
@@ -12,8 +12,7 @@ sys.path.append('.')
|
|
12 |
|
13 |
from sklearn import linear_model
|
14 |
from sklearn.model_selection import KFold
|
15 |
-
from sklearn.metrics import mean_squared_error,mean_absolute_error
|
16 |
-
from sklearn.ensemble import RandomForestRegressor
|
17 |
from sklearn.preprocessing import MinMaxScaler
|
18 |
|
19 |
skempi_vectors_path = None
|
@@ -22,35 +21,35 @@ representation_name = None
|
|
22 |
def load_representation(multi_col_representation_vector_file_path):
|
23 |
print("\nLoading representation vectors...\n")
|
24 |
multi_col_representation_vector = pd.read_csv(multi_col_representation_vector_file_path)
|
25 |
-
vals = multi_col_representation_vector.iloc[:,1:(len(multi_col_representation_vector.columns))]
|
26 |
-
original_values_as_df = pd.DataFrame({'PDB_ID': pd.Series([], dtype='str'),'Vector': pd.Series([], dtype='object')})
|
27 |
-
for index, row in tqdm.tqdm(vals.iterrows(), total
|
28 |
list_of_floats = [float(item) for item in list(row)]
|
29 |
original_values_as_df.loc[index] = [multi_col_representation_vector.iloc[index]['PDB_ID']] + [list_of_floats]
|
30 |
return original_values_as_df
|
31 |
|
32 |
def calc_train_error(X_train, y_train, model):
|
33 |
-
'''
|
34 |
predictions = model.predict(X_train)
|
35 |
mse = mean_squared_error(y_train, predictions)
|
36 |
mae = mean_absolute_error(y_train, predictions)
|
37 |
corr = scipy.stats.pearsonr(y_train, predictions)
|
38 |
-
return mse,mae,corr
|
39 |
-
|
40 |
def calc_validation_error(X_test, y_test, model):
|
41 |
-
'''
|
42 |
predictions = model.predict(X_test)
|
43 |
mse = mean_squared_error(y_test, predictions)
|
44 |
mae = mean_absolute_error(y_test, predictions)
|
45 |
corr = scipy.stats.pearsonr(y_test, predictions)
|
46 |
-
return mse,mae,corr
|
47 |
-
|
48 |
def calc_metrics(X_train, y_train, X_test, y_test, model):
|
49 |
-
'''
|
50 |
model.fit(X_train, y_train)
|
51 |
-
train_mse_error,train_mae_error,train_corr = calc_train_error(X_train, y_train, model)
|
52 |
-
val_mse_error,val_mae_error,val_corr = calc_validation_error(X_test, y_test, model)
|
53 |
-
return train_mse_error, val_mse_error, train_mae_error, val_mae_error,train_corr,val_corr
|
54 |
|
55 |
def report_results(
|
56 |
train_mse_error_list,
|
@@ -62,41 +61,35 @@ def report_results(
|
|
62 |
train_corr_pval_list,
|
63 |
validation_corr_pval_list,
|
64 |
):
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
"validation_corr_pval": list(np.multiply(validation_corr_pval_list, 100)),
|
93 |
-
},
|
94 |
-
index=range(len(train_mse_error_list)),
|
95 |
-
)
|
96 |
-
return result_df, result_detail_df
|
97 |
-
|
98 |
|
99 |
-
def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
|
100 |
K = 10
|
101 |
kf = KFold(n_splits=K, shuffle=True, random_state=42)
|
102 |
|
@@ -110,13 +103,15 @@ def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
|
|
110 |
validation_corr_pval_list = []
|
111 |
|
112 |
data = np.array(np.asarray(multiplied_vectors_df["Vector"].tolist()), dtype=float)
|
113 |
-
ppi_affinity_filtered_df = ppi_affinity_df
|
114 |
-
|
115 |
-
|
|
|
116 |
target = np.array(ppi_affinity_filtered_df["Affinity"])
|
117 |
scaler = MinMaxScaler()
|
118 |
scaler.fit(target.reshape(-1, 1))
|
119 |
target = scaler.transform(target.reshape(-1, 1))[:, 0]
|
|
|
120 |
for train_index, val_index in tqdm.tqdm(kf.split(data, target), total=K):
|
121 |
|
122 |
# split data
|
@@ -124,9 +119,9 @@ def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
|
|
124 |
y_train, y_val = target[train_index], target[val_index]
|
125 |
|
126 |
# instantiate model
|
127 |
-
reg = regressor_model
|
128 |
|
129 |
-
# calculate
|
130 |
(
|
131 |
train_mse_error,
|
132 |
val_mse_error,
|
@@ -136,7 +131,7 @@ def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
|
|
136 |
val_corr,
|
137 |
) = calc_metrics(X_train, y_train, X_val, y_val, reg)
|
138 |
|
139 |
-
# append to appropriate
|
140 |
train_mse_error_list.append(train_mse_error)
|
141 |
validation_mse_error_list.append(val_mse_error)
|
142 |
|
@@ -162,35 +157,40 @@ def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
|
|
162 |
|
163 |
ppi_affinity_file_path = "../data/auxilary_input/skempi_pipr/SKEMPI_all_dg_avg.txt"
|
164 |
ppi_affinity_file = os.path.join(script_dir, ppi_affinity_file_path)
|
165 |
-
ppi_affinity_df = pd.read_csv(ppi_affinity_file,sep="\t",header=None)
|
166 |
ppi_affinity_df.columns = ['Protein1', 'Protein2', 'Affinity']
|
167 |
|
168 |
-
#Calculate vector element-wise multiplication as described in https://academic.oup.com/bioinformatics/article/35/14/i305/5529260
|
169 |
-
|
170 |
def calculate_vector_multiplications(skempi_vectors_df):
|
171 |
-
multiplied_vectors = pd.DataFrame({
|
172 |
-
|
173 |
-
|
|
|
|
|
174 |
print("Element-wise vector multiplications are being calculated")
|
175 |
rep_prot_list = list(skempi_vectors_df['PDB_ID'])
|
176 |
-
|
|
|
177 |
if row['Protein1'] in rep_prot_list and row['Protein2'] in rep_prot_list:
|
178 |
-
vec1 = list(skempi_vectors_df[skempi_vectors_df['PDB_ID']
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
|
|
187 |
return multiplied_vectors
|
188 |
|
189 |
def predict_affinities_and_report_results():
|
190 |
skempi_vectors_df = load_representation(skempi_vectors_path)
|
191 |
multiplied_vectors_df = calculate_vector_multiplications(skempi_vectors_df)
|
192 |
model = linear_model.BayesianRidge()
|
193 |
-
|
194 |
-
result_df.to_csv(os.path.join(script_dir, r"../results/Affinity_prediction_skempiv1_{0}.csv".format(representation_name)),index=False)
|
195 |
-
result_detail_df.to_csv(os.path.join(script_dir, r"../results/Affinity_prediction_skempiv1_{0}_detail.csv".format(representation_name)),index=False)
|
196 |
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
from sklearn import linear_model
|
14 |
from sklearn.model_selection import KFold
|
15 |
+
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
|
|
16 |
from sklearn.preprocessing import MinMaxScaler
|
17 |
|
18 |
skempi_vectors_path = None
|
|
|
21 |
def load_representation(multi_col_representation_vector_file_path):
|
22 |
print("\nLoading representation vectors...\n")
|
23 |
multi_col_representation_vector = pd.read_csv(multi_col_representation_vector_file_path)
|
24 |
+
vals = multi_col_representation_vector.iloc[:, 1:(len(multi_col_representation_vector.columns))]
|
25 |
+
original_values_as_df = pd.DataFrame({'PDB_ID': pd.Series([], dtype='str'), 'Vector': pd.Series([], dtype='object')})
|
26 |
+
for index, row in tqdm.tqdm(vals.iterrows(), total=len(vals)):
|
27 |
list_of_floats = [float(item) for item in list(row)]
|
28 |
original_values_as_df.loc[index] = [multi_col_representation_vector.iloc[index]['PDB_ID']] + [list_of_floats]
|
29 |
return original_values_as_df
|
30 |
|
31 |
def calc_train_error(X_train, y_train, model):
|
32 |
+
'''Returns in-sample error for an already fit model.'''
|
33 |
predictions = model.predict(X_train)
|
34 |
mse = mean_squared_error(y_train, predictions)
|
35 |
mae = mean_absolute_error(y_train, predictions)
|
36 |
corr = scipy.stats.pearsonr(y_train, predictions)
|
37 |
+
return mse, mae, corr
|
38 |
+
|
39 |
def calc_validation_error(X_test, y_test, model):
|
40 |
+
'''Returns out-of-sample error for an already fit model.'''
|
41 |
predictions = model.predict(X_test)
|
42 |
mse = mean_squared_error(y_test, predictions)
|
43 |
mae = mean_absolute_error(y_test, predictions)
|
44 |
corr = scipy.stats.pearsonr(y_test, predictions)
|
45 |
+
return mse, mae, corr
|
46 |
+
|
47 |
def calc_metrics(X_train, y_train, X_test, y_test, model):
|
48 |
+
'''Fits the model and returns the metrics for in-sample and out-of-sample errors.'''
|
49 |
model.fit(X_train, y_train)
|
50 |
+
train_mse_error, train_mae_error, train_corr = calc_train_error(X_train, y_train, model)
|
51 |
+
val_mse_error, val_mae_error, val_corr = calc_validation_error(X_test, y_test, model)
|
52 |
+
return train_mse_error, val_mse_error, train_mae_error, val_mae_error, train_corr, val_corr
|
53 |
|
54 |
def report_results(
|
55 |
train_mse_error_list,
|
|
|
61 |
train_corr_pval_list,
|
62 |
validation_corr_pval_list,
|
63 |
):
|
64 |
+
result_summary = {
|
65 |
+
"train_mse_error": round(np.mean(train_mse_error_list) * 100, 4),
|
66 |
+
"train_mse_std": round(np.std(train_mse_error_list) * 100, 4),
|
67 |
+
"val_mse_error": round(np.mean(validation_mse_error_list) * 100, 4),
|
68 |
+
"val_mse_std": round(np.std(validation_mse_error_list) * 100, 4),
|
69 |
+
"train_mae_error": round(np.mean(train_mae_error_list) * 100, 4),
|
70 |
+
"train_mae_std": round(np.std(train_mae_error_list) * 100, 4),
|
71 |
+
"val_mae_error": round(np.mean(validation_mae_error_list) * 100, 4),
|
72 |
+
"val_mae_std": round(np.std(validation_mae_error_list) * 100, 4),
|
73 |
+
"train_corr": round(np.mean(train_corr_list), 4),
|
74 |
+
"train_corr_pval": round(np.mean(train_corr_pval_list), 4),
|
75 |
+
"validation_corr": round(np.mean(validation_corr_list), 4),
|
76 |
+
"validation_corr_pval": round(np.mean(validation_corr_pval_list), 4),
|
77 |
+
}
|
78 |
+
|
79 |
+
result_detail = {
|
80 |
+
"train_mse_errors": list(np.multiply(train_mse_error_list, 100)),
|
81 |
+
"val_mse_errors": list(np.multiply(validation_mse_error_list, 100)),
|
82 |
+
"train_mae_errors": list(np.multiply(train_mae_error_list, 100)),
|
83 |
+
"val_mae_errors": list(np.multiply(validation_mae_error_list, 100)),
|
84 |
+
"train_corrs": list(np.multiply(train_corr_list, 100)),
|
85 |
+
"train_corr_pvals": list(np.multiply(train_corr_pval_list, 100)),
|
86 |
+
"validation_corrs": list(np.multiply(validation_corr_list, 100)),
|
87 |
+
"validation_corr_pvals": list(np.multiply(validation_corr_pval_list, 100)),
|
88 |
+
}
|
89 |
+
|
90 |
+
return result_summary, result_detail
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
+
def predictAffinityWithModel(regressor_model, multiplied_vectors_df):
|
93 |
K = 10
|
94 |
kf = KFold(n_splits=K, shuffle=True, random_state=42)
|
95 |
|
|
|
103 |
validation_corr_pval_list = []
|
104 |
|
105 |
data = np.array(np.asarray(multiplied_vectors_df["Vector"].tolist()), dtype=float)
|
106 |
+
ppi_affinity_filtered_df = ppi_affinity_df[
|
107 |
+
ppi_affinity_df['Protein1'].isin(multiplied_vectors_df['Protein1']) &
|
108 |
+
ppi_affinity_df['Protein2'].isin(multiplied_vectors_df['Protein2'])
|
109 |
+
]
|
110 |
target = np.array(ppi_affinity_filtered_df["Affinity"])
|
111 |
scaler = MinMaxScaler()
|
112 |
scaler.fit(target.reshape(-1, 1))
|
113 |
target = scaler.transform(target.reshape(-1, 1))[:, 0]
|
114 |
+
|
115 |
for train_index, val_index in tqdm.tqdm(kf.split(data, target), total=K):
|
116 |
|
117 |
# split data
|
|
|
119 |
y_train, y_val = target[train_index], target[val_index]
|
120 |
|
121 |
# instantiate model
|
122 |
+
reg = regressor_model
|
123 |
|
124 |
+
# calculate errors
|
125 |
(
|
126 |
train_mse_error,
|
127 |
val_mse_error,
|
|
|
131 |
val_corr,
|
132 |
) = calc_metrics(X_train, y_train, X_val, y_val, reg)
|
133 |
|
134 |
+
# append to appropriate lists
|
135 |
train_mse_error_list.append(train_mse_error)
|
136 |
validation_mse_error_list.append(val_mse_error)
|
137 |
|
|
|
157 |
|
158 |
ppi_affinity_file_path = "../data/auxilary_input/skempi_pipr/SKEMPI_all_dg_avg.txt"
|
159 |
ppi_affinity_file = os.path.join(script_dir, ppi_affinity_file_path)
|
160 |
+
ppi_affinity_df = pd.read_csv(ppi_affinity_file, sep="\t", header=None)
|
161 |
ppi_affinity_df.columns = ['Protein1', 'Protein2', 'Affinity']
|
162 |
|
|
|
|
|
163 |
def calculate_vector_multiplications(skempi_vectors_df):
|
164 |
+
multiplied_vectors = pd.DataFrame({
|
165 |
+
'Protein1': pd.Series([], dtype='str'),
|
166 |
+
'Protein2': pd.Series([], dtype='str'),
|
167 |
+
'Vector': pd.Series([], dtype='object')
|
168 |
+
})
|
169 |
print("Element-wise vector multiplications are being calculated")
|
170 |
rep_prot_list = list(skempi_vectors_df['PDB_ID'])
|
171 |
+
|
172 |
+
for index, row in tqdm.tqdm(ppi_affinity_df.iterrows()):
|
173 |
if row['Protein1'] in rep_prot_list and row['Protein2'] in rep_prot_list:
|
174 |
+
vec1 = list(skempi_vectors_df[skempi_vectors_df['PDB_ID'] == row['Protein1']]['Vector'])[0]
|
175 |
+
vec2 = list(skempi_vectors_df[skempi_vectors_df['PDB_ID'] == row['Protein2']]['Vector'])[0]
|
176 |
+
multiplied_vec = np.multiply(vec1, vec2)
|
177 |
+
|
178 |
+
multiplied_vectors = multiplied_vectors.append({
|
179 |
+
'Protein1': row['Protein1'],
|
180 |
+
'Protein2': row['Protein2'],
|
181 |
+
'Vector': multiplied_vec
|
182 |
+
}, ignore_index=True)
|
183 |
+
|
184 |
return multiplied_vectors
|
185 |
|
186 |
def predict_affinities_and_report_results():
|
187 |
skempi_vectors_df = load_representation(skempi_vectors_path)
|
188 |
multiplied_vectors_df = calculate_vector_multiplications(skempi_vectors_df)
|
189 |
model = linear_model.BayesianRidge()
|
190 |
+
result_summary, result_detail = predictAffinityWithModel(model, multiplied_vectors_df)
|
|
|
|
|
191 |
|
192 |
+
# Return the results as a dictionary instead of writing to a file
|
193 |
+
return {
|
194 |
+
"summary": result_summary,
|
195 |
+
"detail": result_detail
|
196 |
+
}
|