mgyigit committed (verified)
Commit 41f6a20 · 1 Parent(s): d3ffc4d

Update src/bin/semantic_similarity_infer.py

Files changed (1)
  1. src/bin/semantic_similarity_infer.py +46 -93
src/bin/semantic_similarity_infer.py CHANGED
@@ -1,32 +1,21 @@
  #!/usr/bin/env python
  # coding: utf-8
  import os
- script_dir = os.path.dirname(os.path.abspath(__file__))
-
  import pandas as pd
  import numpy as np
- import gzip
  import itertools
  import multiprocessing
- import csv
- import pickle
- import random
- from sklearn.metrics.pairwise import cosine_similarity as cosine
- from sklearn.metrics import mean_squared_error as mse
- from tqdm import tqdm, tqdm_notebook
- from multiprocessing import Manager, Pool
  from scipy.spatial.distance import cdist
  from numpy.linalg import norm
- from scipy.stats import spearmanr, pearsonr
- from functools import partial
+ from scipy.stats import spearmanr
+ from tqdm import tqdm

- manager = Manager()
+ manager = multiprocessing.Manager()
  similarity_list = manager.list()
  proteinListNew = manager.list()
-
+
  representation_dataframe = ""
- protein_names = ""
- # define similarity_list and proteinList as global variables
+ protein_names = ""
  representation_name = ""
  similarity_tasks = ""
  detailed_output = False
@@ -34,130 +23,94 @@ detailed_output = False
  def parallelSimilarity(paramList):
      protein_embedding_dataframe = representation_dataframe
      i = paramList[0]
-     j = paramList[1]
-     aspect = paramList[2]
-     if j>i:
+     j = paramList[1]
+     if j > i:
          protein1 = proteinListNew[i]
          protein2 = proteinListNew[j]
          if protein1 in protein_names and protein2 in protein_names:
              prot1vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein1")['Vector'].item())
              prot2vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein2")['Vector'].item())
-             #cosine will return in shape of input vectors first dimension
-             cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
+
+             # Calculate Manhattan Distance and normalize
              manhattanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'cityblock')
-             manhattanDistNorm = manhattanDist/(norm(prot1vec,1) + norm(prot2vec,1))
-             manhattanSim = 1-manhattanDistNorm.item()
-             if (norm(prot1vec,1)==0 and norm(prot2vec,1) == 0):
+             manhattanDistNorm = manhattanDist / (norm(prot1vec,1) + norm(prot2vec,1))
+             manhattanSim = 1 - manhattanDistNorm.item()
+
+             if norm(prot1vec, 1) == 0 and norm(prot2vec, 1) == 0:
                  manhattanSim = 1.0
-             #print((protein1,protein2))
-             #print(manhattanDist)
-             #print(norm(prot1vec,1))
-             #print(norm(prot2vec,1))
-             euclideanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'euclidean')
-             euclideanDistNorm = euclideanDist/(norm(prot1vec,2) + norm(prot2vec,2))
-             euclidianSim = 1-euclideanDistNorm.item()
-             if (norm(prot1vec,1)==0 and norm(prot2vec,1) == 0):
-                 euclidianSim = 1.0
-             real = paramList[3]
-             # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled
-             similarity_list.append((real,cos,manhattanSim ,euclidianSim))
+
+             real = paramList[2]
+             similarity_list.append((real, manhattanSim))
      return similarity_list

- def calculateCorrelationforOntology(aspect,matrix_type):
-     print("\n\nSemantic similarity correlation calculation for aspect: " + aspect + " using matrix/dataset: " + matrix_type + " ...\n")
-     #Clear lists before each aspect
+ def calculateCorrelationforOntology(aspect, matrix_type):
      similarity_list[:] = []
      proteinListNew[:] = []

-     similarityMatrixNameDict = {}
-     similarityMatrixNameDict["All"] = os.path.join(script_dir, "../data/preprocess/human_"+aspect+"_proteinSimilarityMatrix.csv")
-     similarityMatrixNameDict["500"] = os.path.join(script_dir, "../data/preprocess/human_"+aspect+"_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv")
-     similarityMatrixNameDict["Sparse"] = os.path.join(script_dir, "../data/preprocess/human_"+aspect+"_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv")
-     similarityMatrixNameDict["200"] = os.path.join(script_dir, "../data/preprocess/human_"+aspect+"_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv")
+     similarityMatrixNameDict = {
+         "All": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix.csv"),
+         "500": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"),
+         "Sparse": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"),
+         "200": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv")
+     }

      similarityMatrixFileName = similarityMatrixNameDict[matrix_type]
-
      human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
-     human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)
+     human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace=True)
      proteinList = human_proteinSimilarityMatrix.columns

-     #proteinListNew is referanced using Manager
      for prot in proteinList:
          proteinListNew.append(prot)
+
      if matrix_type == "Sparse":
-         #sparsified_similarities = np.load("SparsifiedSimilarites_for_highest_500.npy")
-         sparsified_path = os.path.join(script_dir, "../data/auxilary_input/SparsifiedSimilarityCoordinates_"+aspect+"_for_highest_500.npy")
+         sparsified_path = os.path.join(script_dir, "../data/auxilary_input/SparsifiedSimilarityCoordinates_" + aspect + "_for_highest_500.npy")
          sparsified_similarity_coordinates = np.load(sparsified_path)
          protParamList = sparsified_similarity_coordinates
-     else:
+     else:
          i = range(len(proteinList))
         j = range(len(proteinList))
-         protParamList = list(itertools.product(i,j))
+         protParamList = list(itertools.product(i, j))
+
      protParamListNew = []
-     # Prepare parameters for parallel processing these parameters will be
-     # used concurrently by different processes
      for tup in tqdm(protParamList):
          i = tup[0]
          j = tup[1]
-
          if matrix_type == "Sparse":
              protein1 = proteinListNew[i]
              protein2 = proteinListNew[j]
-             real = human_proteinSimilarityMatrix.loc[protein1,protein2]
-             tupNew = (tup[0],tup[1],aspect,real)
+             real = human_proteinSimilarityMatrix.loc[protein1, protein2]
+             tupNew = (tup[0],tup[1],real)
              protParamListNew.append(tupNew)
          else:
              if j > i:
                  protein1 = proteinListNew[i]
                  protein2 = proteinListNew[j]
-                 real = human_proteinSimilarityMatrix.loc[protein1,protein2]
-                 tupNew = (tup[0],tup[1],aspect,real)
+                 real = human_proteinSimilarityMatrix.loc[protein1, protein2]
+                 tupNew = (tup[0],tup[1],real)
                  protParamListNew.append(tupNew)

-     total_task_num=len(protParamListNew)
-     pool = Pool()
+     pool = multiprocessing.Pool()
      similarity_listRet = []
-     #parallelSimilarityPartial = partial(parallelSimilarity,protein_embedding_type)
-     for similarity_listRet in tqdm(pool.imap_unordered(parallelSimilarity,protParamListNew), total=total_task_num , position=0, leave=True ):
+     for similarity_listRet in tqdm(pool.imap_unordered(parallelSimilarity, protParamListNew), total=len(protParamListNew), position=0, leave=True):
          pass
-         #time.sleep(0.1)
      pool.close()
      pool.join()

      real_distance_list = [value[0] for value in similarity_listRet]
-     cosine_distance_list = [value[1] for value in similarity_listRet]
-     manhattan_distance_list = [value[2] for value in similarity_listRet]
-     euclidian_distance_list = [value[3] for value in similarity_listRet]
+     manhattan_distance_list = [value[1] for value in similarity_listRet]

-     distance_lists = [real_distance_list,cosine_distance_list,manhattan_distance_list,euclidian_distance_list]
-     if detailed_output:
-         report_detailed_distance_scores(representation_name,matrix_type,aspect,distance_lists)
-
-     cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
      manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
-     euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)
-
-     #print("Cosine Correlation for "+aspect+" is " + str(cosineCorr))
-     #print("Manhattan Correlation for "+aspect+" is " + str(manhattanCorr))
-     #print("Euclidian Correlation for "+aspect+" is " + str(euclidianCorr))
-
-     return (cosineCorr,manhattanCorr,euclidianCorr)

- def report_detailed_distance_scores(representation_name,similarity_matrix_type,aspect,distance_lists):
-     saveFileName = os.path.join(script_dir, "../results/Semantic_sim_inference_detailed_distance_scores"+aspect+"_"+similarity_matrix_type+"_"+representation_name+".pkl")
-     with open(saveFileName, "wb") as f:
-         pickle.dump(distance_lists, f)
+     return {
+         "correlation": manhattanCorr[0], "p_value": manhattanCorr[1]
+     }

  def calculate_all_correlations():
+     results = {}
      for similarity_matrix_type in similarity_tasks:
-         saveFileName = os.path.join(script_dir, "../results/Semantic_sim_inference_"+similarity_matrix_type+"_"+representation_name+".csv")
-         buffer = "Semantic Aspect,CosineSim_Correlation,CosineSim_Correlation p-value, ManhattanSim_Correlation,ManhattanSim_Correlation p-value, EuclidianSim_Correlation,EuclidianSim_Correlation p-value \n"
-         f = open(saveFileName,'w')
-         f.write(buffer)
-         for aspect in ["MF","BP","CC"]:
-             corr = calculateCorrelationforOntology(aspect,similarity_matrix_type)
-             buffer = "" + aspect + ","+ str(round(corr[0][0],5))+ ","+ str(round(corr[0][1],5))+ ","+ str(round(corr[1][0],5))\
-                 + ","+ str(round(corr[1][1],5))+ ","+ str(round(corr[2][0],5))+ ","+str(round(corr[2][1],5))+"\n"
-             f = open(saveFileName,'a')
-             f.write(buffer)
-             f.close()
+         matrix_results = {}
+         for aspect in ["MF", "BP", "CC"]:
+             corr = calculateCorrelationforOntology(aspect, similarity_matrix_type)
+             matrix_results[aspect] = corr
+         results[similarity_matrix_type] = matrix_results
+     return results
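
The only metric retained by the new version is the L1-normalized Manhattan similarity, sim(a, b) = 1 - ||a - b||_1 / (||a||_1 + ||b||_1), with the similarity forced to 1.0 when both vectors are all-zero. A quick standalone check of that formula (not part of the commit), using the same cdist/norm calls as parallelSimilarity on toy vectors:

import numpy as np
from numpy.linalg import norm
from scipy.spatial.distance import cdist

# Toy vectors standing in for two protein embeddings (illustrative values only).
a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 2.0, 1.0])

manhattanDist = cdist(a.reshape(1, -1), b.reshape(1, -1), 'cityblock')  # ||a - b||_1 = 3.0
manhattanSim = 1 - (manhattanDist / (norm(a, 1) + norm(b, 1))).item()   # 1 - 3/11 ≈ 0.727

print(manhattanSim)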
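
A minimal driver sketch, not part of the commit, showing how the module's globals might be populated before calling calculate_all_correlations(). The import alias, the pickle path, and the column handling are illustrative assumptions; the sketch also sets script_dir the way the old version did, since the new file still reads it but no longer defines it.

import os
import pandas as pd

import semantic_similarity_infer as ssi  # assumed import path for src/bin/semantic_similarity_infer.py

if __name__ == "__main__":
    # Assumption: script_dir is provided by the caller, mirroring the removed
    # os.path.dirname(os.path.abspath(__file__)) line from the old version.
    ssi.script_dir = os.path.dirname(os.path.abspath(ssi.__file__))

    # Assumption: representation vectors live in a dataframe with 'Entry' and
    # 'Vector' columns, matching the query in parallelSimilarity; the path is a placeholder.
    rep = pd.read_pickle("my_representation_vectors.pkl")
    ssi.representation_dataframe = rep
    ssi.protein_names = rep['Entry'].tolist()

    ssi.representation_name = "my_representation"  # placeholder label
    ssi.similarity_tasks = ["500"]                 # any of "All", "500", "Sparse", "200"

    results = ssi.calculate_all_correlations()
    # e.g. {"500": {"MF": {"correlation": ..., "p_value": ...}, "BP": {...}, "CC": {...}}}
    print(results)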