#!/usr/bin/env python
# coding: utf-8
import os
script_dir = os.path.dirname(os.path.abspath(__file__))
import pandas as pd
import numpy as np
import itertools
import multiprocessing
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr
from tqdm import tqdm

# Shared, process-safe containers filled by the worker processes.
manager = multiprocessing.Manager()
similarity_list = manager.list()
proteinListNew = manager.list()

# Module-level inputs; these are expected to be populated by the caller
# before calculate_all_correlations() is invoked.
representation_dataframe = ""
protein_names = ""
representation_name = ""
similarity_tasks = ""
detailed_output = False
def parallelSimilarity(paramList):
    protein_embedding_dataframe = representation_dataframe
    i = paramList[0]
    j = paramList[1]
    if j > i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        if protein1 in protein_names and protein2 in protein_names:
            prot1vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein1")['Vector'].item())
            prot2vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein2")['Vector'].item())
            # Calculate the Manhattan (cityblock) distance and normalize it by the
            # sum of the vectors' L1 norms so the similarity falls in [0, 1].
            if norm(prot1vec, 1) == 0 and norm(prot2vec, 1) == 0:
                # Two all-zero vectors are identical; guard against division by zero.
                manhattanSim = 1.0
            else:
                manhattanDist = cdist(prot1vec.reshape(1, -1), prot2vec.reshape(1, -1), 'cityblock')
                manhattanDistNorm = manhattanDist / (norm(prot1vec, 1) + norm(prot2vec, 1))
                manhattanSim = 1 - manhattanDistNorm.item()
            real = paramList[2]
            similarity_list.append((real, manhattanSim))
    # Return the shared list so the parent loop ends up holding the accumulated results.
    return similarity_list
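
# Worked example of the normalization above (illustrative comment only, with
# made-up toy vectors): for prot1vec = [1, 2] and prot2vec = [3, 0],
#   cityblock distance  = |1 - 3| + |2 - 0| = 4
#   sum of L1 norms     = (1 + 2) + (3 + 0) = 6
#   normalized distance = 4 / 6 ~= 0.667, so manhattanSim ~= 0.333.
# By the triangle inequality the normalized distance never exceeds 1, so the
# similarity stays in [0, 1] regardless of the embedding scale.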
def calculateCorrelationforOntology(aspect, matrix_type):
    # Reset the shared containers for this aspect / matrix-type combination.
    similarity_list[:] = []
    proteinListNew[:] = []
    # Note: the "sparse" task reads the 500-protein matrix; the sparsified
    # coordinates loaded below select which pairs are actually evaluated.
    similarityMatrixNameDict = {
        "All": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix.csv"),
        "500": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"),
        "sparse": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"),
        "200": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv")
    }
    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]
    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    # The matrix is square, with protein accessions as both columns and rows.
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace=True)
    proteinList = human_proteinSimilarityMatrix.columns
    for prot in proteinList:
        proteinListNew.append(prot)
    if matrix_type == "sparse":
        sparsified_path = os.path.join(script_dir, "../data/auxilary_input/SparsifiedSimilarityCoordinates_" + aspect + "_for_highest_500.npy")
        sparsified_similarity_coordinates = np.load(sparsified_path)
        protParamList = sparsified_similarity_coordinates
    else:
        i = range(len(proteinList))
        j = range(len(proteinList))
        protParamList = list(itertools.product(i, j))
    protParamListNew = []
    for tup in tqdm(protParamList):
        i = tup[0]
        j = tup[1]
        if matrix_type == "sparse":
            protein1 = proteinListNew[i]
            protein2 = proteinListNew[j]
            real = human_proteinSimilarityMatrix.loc[protein1, protein2]
            tupNew = (tup[0], tup[1], real)
            protParamListNew.append(tupNew)
        else:
            if j > i:
                protein1 = proteinListNew[i]
                protein2 = proteinListNew[j]
                real = human_proteinSimilarityMatrix.loc[protein1, protein2]
                tupNew = (tup[0], tup[1], real)
                protParamListNew.append(tupNew)
    # Evaluate the pairs in parallel. Each worker appends to the shared
    # similarity_list and returns it, so after the loop similarity_listRet
    # holds all (real, predicted) similarity tuples.
    pool = multiprocessing.Pool()
    similarity_listRet = []
    for similarity_listRet in tqdm(pool.imap_unordered(parallelSimilarity, protParamListNew), total=len(protParamListNew), position=0, leave=True):
        pass
    pool.close()
    pool.join()
    real_distance_list = [value[0] for value in similarity_listRet]
    manhattan_distance_list = [value[1] for value in similarity_listRet]
    # Spearman rank correlation between ontology-based similarities and
    # embedding-based Manhattan similarities; return (correlation, p-value).
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    return (manhattanCorr[0], manhattanCorr[1])
def calculate_all_correlations():
    # Run the correlation analysis for every requested matrix type and each
    # GO aspect (molecular function, biological process, cellular component).
    results = {}
    for similarity_matrix_type in similarity_tasks:
        for aspect in ["MF", "BP", "CC"]:
            corr, p_value = calculateCorrelationforOntology(aspect, similarity_matrix_type)
            corr_key = f"{similarity_matrix_type}_{aspect}_correlation"
            p_value_key = f"{similarity_matrix_type}_{aspect}_pvalue"
            results[corr_key] = corr
            results[p_value_key] = p_value
    return results
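
# --- Usage sketch (illustrative, not part of the original pipeline) ---
# The module-level globals above are expected to be populated by a driver
# before calculate_all_correlations() is called. The block below is a minimal,
# hypothetical example of such a driver: the file name
# "example_representation.csv" and the vector-parsing step are assumptions,
# not files or behavior defined in this module. It also relies on fork-based
# multiprocessing (Linux) so worker processes inherit the populated globals.
if __name__ == "__main__":
    import ast
    import sys

    # Hypothetical representation file with an "Entry" column (UniProt accession)
    # and a "Vector" column holding the per-protein embedding.
    example_path = os.path.join(script_dir, "example_representation.csv")
    if os.path.exists(example_path):
        representation_dataframe = pd.read_csv(example_path)
        # If vectors are stored as strings such as "[0.1, 0.2, ...]", parse them into lists.
        representation_dataframe['Vector'] = representation_dataframe['Vector'].apply(
            lambda v: ast.literal_eval(v) if isinstance(v, str) else v
        )
        protein_names = set(representation_dataframe['Entry'])
        representation_name = "example_representation"
        similarity_tasks = ["500"]
        print(calculate_all_correlations())
    else:
        sys.exit("No example representation file found; set the module globals and call calculate_all_correlations() from your own driver.")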