File size: 4,829 Bytes
dd49f8a
 
00d5f68
 
 
dd49f8a
 
 
 
 
 
41f6a20
 
dd49f8a
41f6a20
dd49f8a
 
41f6a20
dd49f8a
41f6a20
dd49f8a
 
 
 
 
 
 
41f6a20
 
dd49f8a
 
 
 
 
41f6a20
 
dd49f8a
41f6a20
 
 
 
dd49f8a
41f6a20
 
 
dd49f8a
 
41f6a20
dd49f8a
 
 
41f6a20
 
 
0f597fc
41f6a20
 
dd49f8a
 
 
41f6a20
dd49f8a
 
 
 
41f6a20
0f597fc
41f6a20
794f79d
dd49f8a
41f6a20
dd49f8a
 
41f6a20
 
dd49f8a
 
 
 
0f597fc
dd49f8a
 
41f6a20
 
dd49f8a
 
 
 
 
41f6a20
 
dd49f8a
 
41f6a20
dd49f8a
41f6a20
dd49f8a
 
 
 
 
41f6a20
dd49f8a
 
 
81c47ae
dd49f8a
 
41f6a20
dd49f8a
41f6a20
 
81c47ae
 
 
548b502
81c47ae
 
 
 
41f6a20
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python
# coding: utf-8
import os
# Absolute directory containing this file; the data paths used below are
# resolved relative to it rather than to the current working directory.
script_dir = os.path.dirname(os.path.abspath(__file__))

import pandas as pd
import numpy as np
import itertools
import multiprocessing
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr
from tqdm import tqdm

# Manager-backed lists used by both the parent process and the pool workers.
# NOTE(review): the Manager is created at import time, i.e. importing this
# module has a side effect.
manager = multiprocessing.Manager()
# Collects (real, manhattan) similarity tuples appended by parallelSimilarity;
# cleared at the start of calculateCorrelationforOntology.
similarity_list = manager.list()
# Protein identifiers taken from the similarity matrix columns; looked up by
# position in parallelSimilarity.
proteinListNew = manager.list()

# Module-level settings, initialised to placeholders here.
# NOTE(review): presumably assigned by the calling code before any function
# below is run -- confirm against the caller.

# Queried by parallelSimilarity on its "Entry" and "Vector" columns.
representation_dataframe = ""
# Tested with `in` by parallelSimilarity; a pair is skipped unless both of its
# proteins are present.
protein_names = ""
# Not referenced by the code visible in this part of the file.
representation_name = ""
# Iterated by calculate_all_correlations; each item is passed on as the
# matrix_type argument of calculateCorrelationforOntology.
similarity_tasks = ""
# Not referenced by the code visible in this part of the file.
detailed_output = False

def parallelSimilarity(paramList):
    """Compute the normalised Manhattan similarity of one protein pair.

    Reads the module-level ``representation_dataframe``, ``protein_names``
    and ``proteinListNew`` and appends its result to the shared
    ``similarity_list``. It is mapped over a process pool by
    ``calculateCorrelationforOntology``.

    Args:
        paramList: Sequence ``(i, j, real)``. ``i`` and ``j`` are positions
            in ``proteinListNew``; ``real`` is the reference similarity
            stored alongside the computed one.

    Returns:
        The shared ``similarity_list``. A ``(real, similarity)`` tuple is
        appended only when ``j > i`` and both proteins are found in
        ``protein_names``; otherwise the list is returned untouched.

    Raises:
        ValueError: Propagated from ``Series.item()`` when a protein does
            not match exactly one row, or from ``cdist`` when the two
            vectors have different lengths.
    """
    i = paramList[0]
    j = paramList[1]
    # Only the strict upper triangle is evaluated.
    if not j > i:
        return similarity_list

    protein1 = proteinListNew[i]
    protein2 = proteinListNew[j]
    if protein1 not in protein_names or protein2 not in protein_names:
        return similarity_list

    protein_embedding_dataframe = representation_dataframe
    prot1vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein1")['Vector'].item())
    prot2vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein2")['Vector'].item())

    # Manhattan (cityblock) distance between the two vectors.
    manhattanDist = cdist(prot1vec.reshape(1, -1), prot2vec.reshape(1, -1), 'cityblock')

    # L1 norms are non-negative, so the sum is zero only when both are zero.
    l1_total = norm(prot1vec, 1) + norm(prot2vec, 1)
    if l1_total == 0:
        # Two all-zero vectors are treated as identical; checking before the
        # division avoids evaluating 0/0 (NaN plus a RuntimeWarning).
        manhattanSim = 1.0
    else:
        manhattanSim = 1 - (manhattanDist / l1_total).item()

    real = paramList[2]
    similarity_list.append((real, manhattanSim))
    return similarity_list

def calculateCorrelationforOntology(aspect, matrix_type):
    """Correlate reference protein similarities with representation-based ones.

    Loads the reference similarity matrix for ``aspect``, builds the list of
    protein index pairs to evaluate, computes the Manhattan similarity of
    every pair in a process pool via ``parallelSimilarity`` and returns the
    Spearman correlation between the reference and the computed values.

    The shared ``similarity_list`` and ``proteinListNew`` are reset and
    refilled on every call.

    Args:
        aspect: Ontology aspect inserted into the data file names
            (``calculate_all_correlations`` passes "MF", "BP" and "CC").
        matrix_type: One of "All", "500", "sparse" or "200". "sparse" uses
            the same matrix file as "500" but takes its index pairs from a
            pre-computed ``.npy`` coordinate file instead of enumerating the
            upper triangle.

    Returns:
        Tuple ``(correlation, p_value)`` as returned by
        ``scipy.stats.spearmanr``.

    Raises:
        KeyError: If ``matrix_type`` is not one of the supported keys.
    """
    similarity_list[:] = []
    proteinListNew[:] = []

    matrixPathPrefix = "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix"
    top500Suffix = "_for_highest_annotated_500_proteins.csv"
    matrixSuffixDict = {
        "All": ".csv",
        "500": top500Suffix,
        "sparse": top500Suffix,
        "200": "_for_highest_annotated_200_proteins.csv",
    }

    similarityMatrixFileName = os.path.join(script_dir, matrixPathPrefix + matrixSuffixDict[matrix_type])
    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace=True)

    # Keep a plain local list for lookups in this process: every index into
    # the manager proxy is a separate inter-process round trip.
    proteinList = list(human_proteinSimilarityMatrix.columns)
    proteinListNew.extend(proteinList)

    if matrix_type == "sparse":
        sparsified_path = os.path.join(script_dir, "../data/auxilary_input/SparsifiedSimilarityCoordinates_" + aspect + "_for_highest_500.npy")
        candidatePairs = np.load(sparsified_path)
        candidateCount = len(candidatePairs)
    else:
        # combinations() yields exactly the (i, j) pairs with j > i, in the
        # same order as filtering the full cartesian product, without
        # materialising the n*n list first.
        proteinCount = len(proteinList)
        candidatePairs = itertools.combinations(range(proteinCount), 2)
        candidateCount = proteinCount * (proteinCount - 1) // 2

    protParamListNew = []
    for tup in tqdm(candidatePairs, total=candidateCount):
        i = tup[0]
        j = tup[1]
        real = human_proteinSimilarityMatrix.loc[proteinList[i], proteinList[j]]
        protParamListNew.append((i, j, real))

    # The context manager terminates the workers if anything below raises.
    with multiprocessing.Pool() as pool:
        # Worker return values are ignored; results accumulate in the shared
        # similarity_list. The loop only drives the iterator and progress bar.
        for _ in tqdm(pool.imap_unordered(parallelSimilarity, protParamListNew), total=len(protParamListNew), position=0, leave=True):
            pass
        pool.close()
        pool.join()

    # One slice fetches the whole shared list as a plain list in a single
    # round trip, instead of one per element.
    similarityPairs = similarity_list[:]

    real_distance_list = [value[0] for value in similarityPairs]
    manhattan_distance_list = [value[1] for value in similarityPairs]

    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)

    return (manhattanCorr[0], manhattanCorr[1])

def calculate_all_correlations():
    """Run the correlation analysis for every configured matrix type and aspect.

    Iterates over the module-level ``similarity_tasks`` and, for each entry,
    over the aspects "MF", "BP" and "CC", calling
    ``calculateCorrelationforOntology`` for each combination.

    Returns:
        Dict with two entries per combination:
        ``"<matrix_type>_<aspect>_correlation"`` and
        ``"<matrix_type>_<aspect>_pvalue"``. Empty when ``similarity_tasks``
        is empty.
    """
    results = {}
    for similarity_matrix_type in similarity_tasks:
        for aspect in ["MF", "BP", "CC"]:
            corr, p_value = calculateCorrelationforOntology(aspect, similarity_matrix_type)

            results[f"{similarity_matrix_type}_{aspect}_correlation"] = corr
            results[f"{similarity_matrix_type}_{aspect}_pvalue"] = p_value

    return results