#!/usr/bin/env python
# coding: utf-8
import os
script_dir = os.path.dirname(os.path.abspath(__file__))
import pandas as pd
import numpy as np
import itertools
import multiprocessing
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr
from tqdm import tqdm
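
# Cross-process shared state: worker processes append (real, predicted) similarity
# pairs to similarity_list, and proteinListNew holds the ordered protein IDs.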
manager = multiprocessing.Manager()
similarity_list = manager.list()
proteinListNew = manager.list()
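
# Module-level configuration, expected to be populated by the calling script before
# the pool is started (e.g. the embedding dataframe with "Entry" and "Vector"
# columns, the set of protein IDs, and the list of similarity matrix types to run).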
representation_dataframe = ""
protein_names = ""
representation_name = ""
similarity_tasks = ""
detailed_output = False
def parallelSimilarity(paramList):
    # paramList is a tuple (i, j, real): i and j index proteinListNew,
    # real is the ontology-based similarity read from the similarity matrix.
    protein_embedding_dataframe = representation_dataframe
    i = paramList[0]
    j = paramList[1]
    if j > i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        if protein1 in protein_names and protein2 in protein_names:
            prot1vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein1")['Vector'].item())
            prot2vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein2")['Vector'].item())
            # Calculate the Manhattan distance, normalize it by the summed L1 norms,
            # and convert it into a similarity in [0, 1].
            l1_norm_sum = norm(prot1vec, 1) + norm(prot2vec, 1)
            if l1_norm_sum == 0:
                # Two all-zero vectors are treated as identical; checking first
                # also avoids a division by zero below.
                manhattanSim = 1.0
            else:
                manhattanDist = cdist(prot1vec.reshape(1, -1), prot2vec.reshape(1, -1), 'cityblock')
                manhattanDistNorm = manhattanDist / l1_norm_sum
                manhattanSim = 1 - manhattanDistNorm.item()
            real = paramList[2]
            similarity_list.append((real, manhattanSim))
    return similarity_list
def calculateCorrelationforOntology(aspect, matrix_type):
    # Reset the shared lists so results from a previous aspect/matrix type do not leak in.
    similarity_list[:] = []
    proteinListNew[:] = []
    # Ontology-based protein similarity matrices; "sparse" reuses the 500-protein
    # matrix and evaluates only precomputed pair coordinates.
    similarityMatrixNameDict = {
        "All": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix.csv"),
        "500": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"),
        "sparse": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"),
        "200": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv")
    }
    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]
    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace=True)
    proteinList = human_proteinSimilarityMatrix.columns
    for prot in proteinList:
        proteinListNew.append(prot)
    if matrix_type == "sparse":
        # Load the precomputed (i, j) index pairs used for the sparsified evaluation.
        sparsified_path = os.path.join(script_dir, "../data/auxilary_input/SparsifiedSimilarityCoordinates_" + aspect + "_for_highest_500.npy")
        sparsified_similarity_coordinates = np.load(sparsified_path)
        protParamList = sparsified_similarity_coordinates
    else:
        # Enumerate every index pair; parallelSimilarity keeps only pairs with j > i.
        i = range(len(proteinList))
        j = range(len(proteinList))
        protParamList = list(itertools.product(i, j))
    protParamListNew = []
    for tup in tqdm(protParamList):
        i = tup[0]
        j = tup[1]
        if matrix_type == "sparse":
            protein1 = proteinListNew[i]
            protein2 = proteinListNew[j]
            real = human_proteinSimilarityMatrix.loc[protein1, protein2]
            tupNew = (tup[0], tup[1], real)
            protParamListNew.append(tupNew)
        else:
            if j > i:
                protein1 = proteinListNew[i]
                protein2 = proteinListNew[j]
                real = human_proteinSimilarityMatrix.loc[protein1, protein2]
                tupNew = (tup[0], tup[1], real)
                protParamListNew.append(tupNew)
    # Score all pairs in parallel; every worker call appends to the shared
    # similarity_list, so the returned proxy holds all (real, predicted) pairs.
    pool = multiprocessing.Pool()
    similarity_listRet = []
    for similarity_listRet in tqdm(pool.imap_unordered(parallelSimilarity, protParamListNew),
                                   total=len(protParamListNew), position=0, leave=True):
        pass
    pool.close()
    pool.join()
    # Spearman rank correlation between ontology-based and embedding-based similarities.
    real_similarity_list = [value[0] for value in similarity_listRet]
    manhattan_similarity_list = [value[1] for value in similarity_listRet]
    manhattanCorr = spearmanr(real_similarity_list, manhattan_similarity_list)
    return (manhattanCorr[0], manhattanCorr[1])
def calculate_all_correlations():
    # Run every requested similarity matrix type against the three GO aspects and
    # collect the Spearman correlation and p-value for each combination.
    results = {}
    for similarity_matrix_type in similarity_tasks:
        for aspect in ["MF", "BP", "CC"]:
            corr, p_value = calculateCorrelationforOntology(aspect, similarity_matrix_type)
            corr_key = f"{similarity_matrix_type}_{aspect}_correlation"
            p_value_key = f"{similarity_matrix_type}_{aspect}_pvalue"
            results[corr_key] = corr
            results[p_value_key] = p_value
    return results
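
# Minimal usage sketch (an assumption about the calling convention, not part of
# the original pipeline). A driver script would populate the module-level globals
# above and then call calculate_all_correlations(); the module and variable names
# below are illustrative.
#
#   import semantic_similarity_infer as ssi
#   ssi.representation_dataframe = vectors_df            # columns: "Entry", "Vector"
#   ssi.protein_names = set(vectors_df["Entry"])
#   ssi.representation_name = "my_representation"
#   ssi.similarity_tasks = ["500"]
#   print(ssi.calculate_all_correlations())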