mgyigit committed
Commit 81b1545 · verified · 1 Parent(s): 81c47ae

Update src/bin/function_predictor.py

Files changed (1): src/bin/function_predictor.py +75 -158
src/bin/function_predictor.py CHANGED
@@ -1,12 +1,10 @@
 # -*- coding: utf-8 -*-
-import os
+import os
 script_dir = os.path.dirname(os.path.abspath(__file__))
 
 import pandas as pd
 import numpy as np
 from datetime import datetime
-import pickle
-import os
 import multiprocessing
 from tqdm import tqdm
 
@@ -16,7 +14,6 @@ from sklearn.model_selection import cross_val_predict, KFold
 from skmultilearn.problem_transform import BinaryRelevance
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, hamming_loss
 
-
 aspect_type = ""
 dataset_type = ""
 representation_dataframe = ""
@@ -32,94 +29,69 @@ def check_for_at_least_two_class_sample_exits(y):
     for column in y:
         column_sum = np.sum(y[column].array)
         if column_sum < 2:
-            print('At least 2 positive samples are required for each class {0} class has {1} positive samples'.format(column,column_sum))
+            print('At least 2 positive samples are required for each class {0} class has {1} positive samples'.format(column, column_sum))
             return False
     return True
 
-def create_valid_kfold_object_for_multilabel_splits(X,y,kf):
-    check_for_at_least_two_class_sample_exits(y)
-    sample_class_occurance = dict(zip(y.columns,np.zeros(len(y.columns))))
+def create_valid_kfold_object_for_multilabel_splits(X, y, kf):
+    if not check_for_at_least_two_class_sample_exits(y):
+        return None
+
+    sample_class_occurance = dict(zip(y.columns, np.zeros(len(y.columns))))
     for column in y:
-        for fold_train_index,fold_test_index in kf.split(X,y):
-            fold_col_sum = np.sum(y.iloc[fold_test_index,:][column].array)
+        for fold_train_index, fold_test_index in kf.split(X, y):
+            fold_col_sum = np.sum(y.iloc[fold_test_index, :][column].array)
             if fold_col_sum > 0:
-                sample_class_occurance[column] += 1
+                sample_class_occurance[column] += 1
 
-    for key in sample_class_occurance:
-        value = sample_class_occurance[key]
+    for key, value in sample_class_occurance.items():
         if value < 2:
             random_state = np.random.randint(1000)
-            print("Random state is changed since at least two positive samples are required in different train/test folds.\
-            \nHowever, only one fold exits with positive samples for class {0}".format(key))
-            print("Selected random state is {0}".format(random_state))
+            print(f"Random state is changed since at least two positive samples are required in different train/test folds. "
+                  f"However, only one fold exists with positive samples for class {key}")
+            print(f"Selected random state is {random_state}")
             kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
-            create_valid_kfold_object_for_multilabel_splits(X,y,kf)
-        else:
-            return kf
+            return create_valid_kfold_object_for_multilabel_splits(X, y, kf)
+    return kf
 
 def MultiLabelSVC_cross_val_predict(representation_name, dataset, X, y, classifier):
-    #dataset split, estimator, cv
     clf = classifier
-    Xn = np.array(np.asarray(X.values.tolist()), dtype=float)
+    Xn = np.array(X.tolist(), dtype=float)
     kf_init = KFold(n_splits=5, shuffle=True, random_state=42)
-    kf = create_valid_kfold_object_for_multilabel_splits(X,y,kf_init)
+    kf = create_valid_kfold_object_for_multilabel_splits(X, y, kf_init)
+    if kf is None:
+        return None
+
     y_pred = cross_val_predict(clf, Xn, y, cv=kf)
 
-    if detailed_output:
-        ont_path = r"../results/Ontology_based_function_prediction_{1}_{0}_model.pkl".format(representation_name,dataset.split(".")[0])
-        with open(os.path.join(script_dir, ont_path),"wb") as file:
-            pickle.dump(clf,file)
-
-    acc_cv = []
-    f1_mi_cv = []
-    f1_ma_cv = []
-    f1_we_cv = []
-    pr_mi_cv = []
-    pr_ma_cv = []
-    pr_we_cv = []
-    rc_mi_cv = []
-    rc_ma_cv = []
-    rc_we_cv = []
+    acc_cv, f1_mi_cv, f1_ma_cv, f1_we_cv = [], [], [], []
+    pr_mi_cv, pr_ma_cv, pr_we_cv = [], [], []
+    rc_mi_cv, rc_ma_cv, rc_we_cv = [], [], []
     hamm_cv = []
-    for fold_train_index,fold_test_index in kf.split(X,y):
-        acc = accuracy_score(y.iloc[fold_test_index,:],y_pred[fold_test_index])
-        acc_cv.append(np.round(acc,decimals=5))
-        f1_mi = f1_score(y.iloc[fold_test_index,:],y_pred[fold_test_index],average="micro")
-        f1_mi_cv.append(np.round(f1_mi,decimals=5))
-        f1_ma = f1_score(y.iloc[fold_test_index,:],y_pred[fold_test_index],average="macro")
-        f1_ma_cv.append(np.round(f1_ma,decimals=5))
-        f1_we = f1_score(y.iloc[fold_test_index,:],y_pred[fold_test_index],average="weighted")
-        f1_we_cv.append(np.round(f1_we,decimals=5))
-        pr_mi = precision_score(y.iloc[fold_test_index,:],y_pred[fold_test_index],average="micro")
-        pr_mi_cv.append(np.round(pr_mi,decimals=5))
-        pr_ma = precision_score(y.iloc[fold_test_index,:],y_pred[fold_test_index],average="macro")
-        pr_ma_cv.append(np.round(pr_ma,decimals=5))
-        pr_we = precision_score(y.iloc[fold_test_index,:],y_pred[fold_test_index],average="weighted")
-        pr_we_cv.append(np.round(pr_we,decimals=5))
-        rc_mi = recall_score(y.iloc[fold_test_index,:],y_pred[fold_test_index],average="micro")
-        rc_mi_cv.append(np.round(rc_mi,decimals=5))
-        rc_ma = recall_score(y.iloc[fold_test_index,:],y_pred[fold_test_index],average="macro")
-        rc_ma_cv.append(np.round(rc_ma,decimals=5))
-        rc_we = recall_score(y.iloc[fold_test_index,:],y_pred[fold_test_index],average="weighted")
-        rc_we_cv.append(np.round(rc_we,decimals=5))
-        hamm = hamming_loss(y.iloc[fold_test_index,:],y_pred[fold_test_index])
-        hamm_cv.append(np.round(hamm,decimals=5))
+
+    for fold_train_index, fold_test_index in kf.split(X, y):
+        acc = accuracy_score(y.iloc[fold_test_index, :], y_pred[fold_test_index])
+        acc_cv.append(np.round(acc, decimals=5))
+        f1_mi_cv.append(np.round(f1_score(y.iloc[fold_test_index, :], y_pred[fold_test_index], average="micro"), decimals=5))
+        f1_ma_cv.append(np.round(f1_score(y.iloc[fold_test_index, :], y_pred[fold_test_index], average="macro"), decimals=5))
+        f1_we_cv.append(np.round(f1_score(y.iloc[fold_test_index, :], y_pred[fold_test_index], average="weighted"), decimals=5))
+        pr_mi_cv.append(np.round(precision_score(y.iloc[fold_test_index, :], y_pred[fold_test_index], average="micro"), decimals=5))
+        pr_ma_cv.append(np.round(precision_score(y.iloc[fold_test_index, :], y_pred[fold_test_index], average="macro"), decimals=5))
+        pr_we_cv.append(np.round(precision_score(y.iloc[fold_test_index, :], y_pred[fold_test_index], average="weighted"), decimals=5))
+        rc_mi_cv.append(np.round(recall_score(y.iloc[fold_test_index, :], y_pred[fold_test_index], average="micro"), decimals=5))
+        rc_ma_cv.append(np.round(recall_score(y.iloc[fold_test_index, :], y_pred[fold_test_index], average="macro"), decimals=5))
+        rc_we_cv.append(np.round(recall_score(y.iloc[fold_test_index, :], y_pred[fold_test_index], average="weighted"), decimals=5))
+        hamm_cv.append(np.round(hamming_loss(y.iloc[fold_test_index, :], y_pred[fold_test_index]), decimals=5))
-
-    means = list(np.mean([acc_cv,f1_mi_cv,f1_ma_cv,f1_we_cv,pr_mi_cv,pr_ma_cv,pr_we_cv,rc_mi_cv,rc_ma_cv,rc_we_cv,hamm_cv], axis=1))
-    means = [np.round(i,decimals=5) for i in means]
-
-    stds = list(np.std([acc_cv,f1_mi_cv,f1_ma_cv,f1_we_cv,pr_mi_cv,pr_ma_cv,pr_we_cv,rc_mi_cv,rc_ma_cv,rc_we_cv,hamm_cv], axis=1))
-    stds = [np.round(i,decimals=5) for i in stds]
-
-    return ([representation_name+"_"+dataset,acc_cv,f1_mi_cv,f1_ma_cv,f1_we_cv,pr_mi_cv,pr_ma_cv,pr_we_cv,rc_mi_cv,rc_ma_cv,rc_we_cv,hamm_cv],\
-            [representation_name+"_"+dataset]+means,\
-            [representation_name+"_"+dataset]+stds,\
-            y_pred)
-
-def ProtDescModel():
-    #desc_file = pd.read_csv(r"protein_representations\final\{0}_dim{1}.tsv".format(representation_name,desc_dim),sep="\t")
+
+    return {
+        "cv_results": [representation_name + "_" + dataset, acc_cv, f1_mi_cv, f1_ma_cv, f1_we_cv, pr_mi_cv, pr_ma_cv, pr_we_cv, rc_mi_cv, rc_ma_cv, rc_we_cv, hamm_cv],
+        "predictions": y_pred
+    }
+
+def ProtDescModel():
     datasets = os.listdir(os.path.join(script_dir, r"../data/auxilary_input/GO_datasets"))
-    if dataset_type == "All_Data_Sets" and aspect_type == "All_Aspects":
+
+    if dataset_type == "All_Data_Sets" and aspect_type == "All_Aspects":
         filtered_datasets = datasets
     elif dataset_type == "All_Data_Sets":
         filtered_datasets = [dataset for dataset in datasets if aspect_type in dataset]
@@ -127,94 +99,39 @@ def ProtDescModel():
         filtered_datasets = [dataset for dataset in datasets if dataset_type in dataset]
     else:
         filtered_datasets = [dataset for dataset in datasets if aspect_type in dataset and dataset_type in dataset]
+
     cv_results = []
-    cv_mean_results = []
-    cv_std_results = []
 
-    for dt in tqdm(filtered_datasets,total=len(filtered_datasets)):
-        print(r"Protein function prediction is started for the dataset: {0}".format(dt.split(".")[0]))
-        dt_file = pd.read_csv(os.path.join(script_dir, r"../data/auxilary_input/GO_datasets/{0}".format(dt)),sep="\t")
-        dt_merge = dt_file.merge(representation_dataframe,left_on="Protein_Id",right_on="Entry")
+    for dt in tqdm(filtered_datasets, total=len(filtered_datasets)):
+        print(f"Protein function prediction is started for the dataset: {dt.split('.')[0]}")
+        dt_file = pd.read_csv(os.path.join(script_dir, f"../data/auxilary_input/GO_datasets/{dt}"), sep="\t")
+        dt_merge = dt_file.merge(representation_dataframe, left_on="Protein_Id", right_on="Entry")
 
         dt_X = dt_merge['Vector']
-        dt_y = dt_merge.iloc[:,1:-2]
-        if check_for_at_least_two_class_sample_exits(dt_y) == False:
-            print(r"No funtion will be predicted for the dataset: {0}".format(dt.split(".")[0]))
+        dt_y = dt_merge.iloc[:, 1:-2]
+        if not check_for_at_least_two_class_sample_exits(dt_y):
+            print(f"No function will be predicted for the dataset: {dt.split('.')[0]}")
             continue
-        #print("raw dt vs. dt_merge: {} - {}".format(len(dt_file),len(dt_merge)))
-        #print("Calculating predictions for " + dt.split(".")[0])
-        #model = MultiLabelSVC_cross_val_predict(representation_name, dt.split(".")[0], dt_X, dt_y, classifier=BinaryRelevance(SVC(kernel="linear", random_state=42)))
-        cpu_number = multiprocessing.cpu_count()
-        model = MultiLabelSVC_cross_val_predict(representation_name, dt.split(".")[0], dt_X, dt_y, classifier=BinaryRelevance(SGDClassifier(n_jobs=cpu_number, random_state=42)))
-        cv_results.append(model[0])
-        cv_mean_results.append(model[1])
-        cv_std_results.append(model[2])
-
-        predictions = dt_merge.iloc[:,:6]
-        predictions["predicted_values"] = list(model[3].toarray())
-        if detailed_output:
-            predictions.to_csv(os.path.join(script_dir, r"../results/Ontology_based_function_prediction_{1}_{0}_predictions.tsv".format(representation_name,dt.split(".")[0])),sep="\t",index=None)
-
-    return (cv_results, cv_mean_results,cv_std_results)
-
-#def pred_output(representation_name, desc_dim):
+
+        cpu_number = multiprocessing.cpu_count()
+        model = MultiLabelSVC_cross_val_predict(representation_name, dt.split(".")[0], dt_X, dt_y,
+                                                classifier=BinaryRelevance(SGDClassifier(n_jobs=cpu_number, random_state=42)))
+
+        if model is not None:
+            cv_results.append(model["cv_results"])
+
+    return {
+        "cv_results": cv_results
+    }
+
 def pred_output():
     model = ProtDescModel()
-    cv_result = model[0]
-    df_cv_result = pd.DataFrame({"Model": pd.Series([], dtype='str') ,"Accuracy": pd.Series([], dtype='float'),"F1_Micro": pd.Series([], dtype='float'),\
-                                 "F1_Macro": pd.Series([], dtype='float'),"F1_Weighted": pd.Series([], dtype='float'),"Precision_Micro": pd.Series([], dtype='float'),\
-                                 "Precision_Macro": pd.Series([], dtype='float'),"Precision_Weighted": pd.Series([], dtype='float'),"Recall_Micro": pd.Series([], dtype='float'),\
-                                 "Recall_Macro": pd.Series([], dtype='float'),"Recall_Weighted": pd.Series([], dtype='float'),"Hamming_Distance": pd.Series([], dtype='float')})
-    for i in cv_result:
-        df_cv_result.loc[len(df_cv_result)] = i
-    if detailed_output:
-        df_cv_result.to_csv(os.path.join(script_dir, r"../results/Ontology_based_function_prediction_5cv_{0}.tsv".format(representation_name)),sep="\t",index=None)
-
-    cv_mean_result = model[1]
-    df_cv_mean_result = pd.DataFrame({"Model": pd.Series([], dtype='str') ,"Accuracy": pd.Series([], dtype='float'),"F1_Micro": pd.Series([], dtype='float'),\
-                                      "F1_Macro": pd.Series([], dtype='float'),"F1_Weighted": pd.Series([], dtype='float'),"Precision_Micro": pd.Series([], dtype='float'),\
-                                      "Precision_Macro": pd.Series([], dtype='float'),"Precision_Weighted": pd.Series([], dtype='float'),"Recall_Micro": pd.Series([], dtype='float'),\
-                                      "Recall_Macro": pd.Series([], dtype='float'),"Recall_Weighted": pd.Series([], dtype='float'),"Hamming_Distance": pd.Series([], dtype='float')})
-
-
-    #pd.DataFrame(columns=["Model","Accuracy","F1_Micro","F1_Macro","F1_Weighted","Precision_Micro","Precision_Macro","Precision_Weighted",\
-    #              "Recall_Micro","Recall_Macro","Recall_Weighted","Hamming_Distance"])
-
-    for j in cv_mean_result:
-        df_cv_mean_result.loc[len(df_cv_mean_result)] = j
-    df_cv_mean_result.to_csv(os.path.join(script_dir, r"../results/Ontology_based_function_prediction_5cv_mean_{0}.tsv".format(representation_name)),sep="\t",index=None)
-
-    #save std deviation of scores to file
-    cv_std_result = model[2]
-    df_cv_std_result = pd.DataFrame({"Model": pd.Series([], dtype='str') ,"Accuracy": pd.Series([], dtype='float'),"F1_Micro": pd.Series([], dtype='float'),\
-                                     "F1_Macro": pd.Series([], dtype='float'),"F1_Weighted": pd.Series([], dtype='float'),"Precision_Micro": pd.Series([], dtype='float'),\
-                                     "Precision_Macro": pd.Series([], dtype='float'),"Precision_Weighted": pd.Series([], dtype='float'),"Recall_Micro": pd.Series([], dtype='float'),\
-                                     "Recall_Macro": pd.Series([], dtype='float'),"Recall_Weighted": pd.Series([], dtype='float'),"Hamming_Distance": pd.Series([], dtype='float')})
-
-
-    #pd.DataFrame(columns=["Model","Accuracy","F1_Micro","F1_Macro","F1_Weighted","Precision_Micro","Precision_Macro","Precision_Weighted",\
-    #              "Recall_Micro","Recall_Macro","Recall_Weighted","Hamming_Distance"])
-
-    for k in cv_std_result:
-        df_cv_std_result.loc[len(df_cv_std_result)] = k
-    df_cv_std_result.to_csv(os.path.join(script_dir, r"../results/Ontology_based_function_prediction_5cv_std_{0}.tsv".format(representation_name)),sep="\t",index=None)
-
-    print(datetime.now())
-
-
-# tcga = pred_output("tcga","50")
-# protvec = pred_output("protvec","100")
-# unirep = pred_output("unirep","5700")
-# gene2vec = pred_output("gene2vec","200")
-# learned_embed = pred_output("learned_embed","64")
-# mut2vec = pred_output("mut2vec","300")
-# seqvec = pred_output("seqvec","1024")
-
-#bepler = pred_output("bepler","100")
-# resnet_rescaled = pred_output("resnet-rescaled","256")
-# transformer_avg = pred_output("transformer","768")
-# transformer_pool = pred_output("transformer-pool","768")
-
-# apaac = pred_output("apaac","80")
-#ksep = pred_output("ksep","400")
+    cv_result = model["cv_results"]
+
+    return {
+        "cv_result": cv_result
+    }
 
+# Example call to the function
+# result = pred_output()
+print(datetime.now())
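
For context on the refactor above: the reworked fold-validation helper now returns None when a class lacks two positive samples, and returns the (possibly reshuffled) KFold from the recursive call instead of recursing without returning, which in the old version could fall through or exit on the first well-covered class. A minimal sketch of the new behavior; the toy DataFrames, GO label names, and import setup are hypothetical illustrations, not part of the repository:

# Minimal sketch, assuming src/bin is on sys.path so function_predictor imports
# cleanly and its dependencies (scikit-learn, scikit-multilearn, tqdm) are installed.
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

from function_predictor import create_valid_kfold_object_for_multilabel_splits

# Hypothetical toy data: 10 proteins with 4-dim vectors and two GO classes.
X = pd.Series([list(np.random.rand(4)) for _ in range(10)])
y = pd.DataFrame({"GO_A": [1, 1, 0, 0, 1, 0, 1, 0, 0, 1],
                  "GO_B": [0, 1, 1, 0, 0, 1, 0, 1, 1, 0]})

# Both classes have 5 positives, so their positives necessarily land in at least
# two of the five test folds and the initial KFold is returned unchanged.
kf = create_valid_kfold_object_for_multilabel_splits(X, y, KFold(n_splits=5, shuffle=True, random_state=42))
print(kf)

# A class with fewer than 2 positives now short-circuits to None instead of
# recursing on a failed precondition; callers check this via "if kf is None".
y_rare = y.assign(GO_A=[1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
print(create_valid_kfold_object_for_multilabel_splits(X, y_rare, KFold(n_splits=5, shuffle=True, random_state=42)))  # None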
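
Since the commit also reshapes the tuple returns into dicts, here is a hedged sketch of how the module might now be driven end to end. The embedding table below is invented, and treating representation_name as a module global is an assumption inferred from its unqualified use inside ProtDescModel:

import pandas as pd
import function_predictor as fp  # again assumes src/bin is on sys.path

# The script is configured through module-level globals before calling pred_output().
fp.aspect_type = "All_Aspects"
fp.dataset_type = "All_Data_Sets"
fp.representation_name = "toy_embedding"   # assumed global; used unqualified above

# Hypothetical representation table: one row per protein, an "Entry" id column to
# merge on, and a "Vector" column holding each embedding as a list of floats.
fp.representation_dataframe = pd.DataFrame({
    "Entry": ["P0A001", "P0A002"],
    "Vector": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
})

result = fp.pred_output()        # now {"cv_result": [...]} rather than a tuple
for row in result["cv_result"]:
    print(row[0])                # "<representation>_<dataset>"; the 5-fold score lists follow in row[1:]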