PROBE / src /bin /semantic_similarity_infer.py
mgyigit's picture
Update src/bin/semantic_similarity_infer.py
81c47ae verified
raw
history blame
4.83 kB
#!/usr/bin/env python
# coding: utf-8
import os
script_dir = os.path.dirname(os.path.abspath(__file__))
import pandas as pd
import numpy as np
import itertools
import multiprocessing
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr
from tqdm import tqdm
# Manager whose list proxies are shared between this process and the pool
# workers started in calculateCorrelationforOntology.
manager = multiprocessing.Manager()
# Shared accumulator: parallelSimilarity appends (real, manhattanSim) tuples here.
similarity_list = manager.list()
# Shared list of similarity-matrix column labels; workers index it by position.
proteinListNew = manager.list()
# The names below are placeholders. Nothing in this file assigns them real
# values, so presumably the caller overwrites them before running -- verify.
# Queried as a DataFrame with "Entry" and "Vector" columns in parallelSimilarity.
representation_dataframe = ""
# Used for membership tests (`protein in protein_names`) in parallelSimilarity.
protein_names = ""
# Not read anywhere in the visible code.
representation_name = ""
# Iterated by calculate_all_correlations; each item is passed on as matrix_type.
similarity_tasks = ""
# Not read anywhere in the visible code.
detailed_output = False
def parallelSimilarity(paramList):
    """Score one protein pair and record the result in ``similarity_list``.

    Intended as the worker function for a ``multiprocessing.Pool``. The pair
    is scored with a normalised Manhattan similarity,
    ``1 - |v1 - v2|_1 / (|v1|_1 + |v2|_1)``, and the tuple
    ``(real, manhattanSim)`` is appended to the shared ``similarity_list``.

    A pair is skipped (nothing is appended) when ``j <= i`` or when either
    protein is missing from the module-level ``protein_names``.

    Args:
        paramList: Indexable ``(i, j, real)``. ``i`` and ``j`` are positions
            in ``proteinListNew``; ``real`` is the reference similarity that
            is stored next to the computed one. ``real`` is only read when
            the pair is actually scored.

    Returns:
        The shared ``similarity_list`` object, whether or not this call
        appended to it.
    """
    i = paramList[0]
    j = paramList[1]
    # Only pairs with j > i are scored.
    if j <= i:
        return similarity_list

    # NOTE: these two locals must keep the names `protein1` / `protein2`,
    # because the query strings below reference them as @protein1 / @protein2.
    protein1 = proteinListNew[i]
    protein2 = proteinListNew[j]
    if protein1 not in protein_names or protein2 not in protein_names:
        return similarity_list

    protein_embedding_dataframe = representation_dataframe
    prot1vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein1")['Vector'].item())
    prot2vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein2")['Vector'].item())

    # Calculate Manhattan Distance and normalize
    manhattanDist = cdist(prot1vec.reshape(1, -1), prot2vec.reshape(1, -1), 'cityblock')
    norm1 = norm(prot1vec, 1)
    norm2 = norm(prot2vec, 1)
    if norm1 == 0 and norm2 == 0:
        # Two all-zero vectors are treated as identical. Checking this first
        # avoids a 0/0 division (RuntimeWarning plus a throw-away NaN).
        manhattanSim = 1.0
    else:
        manhattanDistNorm = manhattanDist / (norm1 + norm2)
        manhattanSim = 1 - manhattanDistNorm.item()

    real = paramList[2]
    similarity_list.append((real, manhattanSim))
    return similarity_list
def calculateCorrelationforOntology(aspect, matrix_type):
    """Correlate computed and reference protein similarities for one aspect.

    Loads the reference similarity matrix selected by ``aspect`` and
    ``matrix_type``, builds the list of protein pairs to evaluate, scores
    each pair in a worker pool via ``parallelSimilarity`` and returns the
    Spearman correlation between the reference values and the computed
    Manhattan similarities.

    Args:
        aspect: Ontology aspect tag inserted into the data file names
            (``calculate_all_correlations`` passes "MF", "BP" and "CC").
        matrix_type: One of "All", "500", "Sparse" or "200". "Sparse" reads
            the same matrix file as "500" but takes its pairs from a
            precomputed coordinate file instead of all pairs with ``j > i``.

    Returns:
        tuple: ``(correlation, p_value)`` as produced by
        ``scipy.stats.spearmanr``.

    Raises:
        KeyError: if ``matrix_type`` is not one of the supported keys.
    """
    # Clear the shared lists so results from a previous call do not leak in.
    similarity_list[:] = []
    proteinListNew[:] = []

    similarityMatrixNameDict = {
        "All": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix.csv"),
        "500": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"),
        "Sparse": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"),
        "200": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv"),
    }
    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]
    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    # The CSV carries labels only in its header; reuse them as row labels.
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace=True)

    # Keep a plain local copy of the labels for lookups in this process and
    # publish them to the workers with a single proxy call.
    proteinList = list(human_proteinSimilarityMatrix.columns)
    proteinListNew.extend(proteinList)

    if matrix_type == "Sparse":
        sparsified_path = os.path.join(script_dir, "../data/auxilary_input/SparsifiedSimilarityCoordinates_" + aspect + "_for_highest_500.npy")
        protParamList = np.load(sparsified_path)
        pair_count = len(protParamList)
    else:
        # Every unordered pair (i, j) with j > i, in the same order that
        # filtering the full i x j product would give.
        protein_count = len(proteinList)
        protParamList = itertools.combinations(range(protein_count), 2)
        pair_count = protein_count * (protein_count - 1) // 2

    # Attach the reference similarity to each pair: (i, j, real).
    protParamListNew = []
    for tup in tqdm(protParamList, total=pair_count):
        i = tup[0]
        j = tup[1]
        real = human_proteinSimilarityMatrix.loc[proteinList[i], proteinList[j]]
        protParamListNew.append((i, j, real))

    # The context manager terminates the workers if scoring raises; on the
    # normal path the pool is closed and joined explicitly first.
    with multiprocessing.Pool() as pool:
        for _ in tqdm(pool.imap_unordered(parallelSimilarity, protParamListNew), total=len(protParamListNew), position=0, leave=True):
            pass
        pool.close()
        pool.join()

    # Take one snapshot of the shared list (a single proxy call) instead of
    # iterating the proxy element by element.
    scored_pairs = similarity_list[:]
    real_distance_list = [value[0] for value in scored_pairs]
    manhattan_distance_list = [value[1] for value in scored_pairs]
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    return (manhattanCorr[0], manhattanCorr[1])
def calculate_all_correlations():
    """Run the correlation benchmark for every configured matrix type.

    For each entry of the module-level ``similarity_tasks`` and each of the
    aspects "MF", "BP" and "CC", calls ``calculateCorrelationforOntology``
    and stores its two return values.

    Returns:
        dict: Flat mapping with two entries per (matrix type, aspect) pair,
        keyed ``"<matrix_type>_<aspect>_correlation"`` and
        ``"<matrix_type>_<aspect>_p_value"``. Empty when ``similarity_tasks``
        is empty.
    """
    results = {}
    for similarity_matrix_type in similarity_tasks:
        for aspect in ["MF", "BP", "CC"]:
            corr, p_value = calculateCorrelationforOntology(aspect, similarity_matrix_type)
            results[f"{similarity_matrix_type}_{aspect}_correlation"] = corr
            results[f"{similarity_matrix_type}_{aspect}_p_value"] = p_value
    return results