#!/usr/bin/env python
# coding: utf-8
"""Correlate embedding-based protein similarities with reference similarities.

For every selected protein pair, a normalised Manhattan similarity is computed
from the vectors in ``representation_dataframe`` and compared (Spearman rank
correlation) with the value stored in a pre-computed similarity-matrix CSV.

The module-level names ``representation_dataframe``, ``protein_names``,
``representation_name``, ``similarity_tasks`` and ``detailed_output`` are
placeholders that the caller is expected to assign before calling
``calculate_all_correlations``.
"""

import itertools
import multiprocessing
import os

import numpy as np
import pandas as pd
from numpy.linalg import norm
from scipy.spatial.distance import cdist
from scipy.stats import spearmanr

try:
    from tqdm import tqdm
except ImportError:
    # Progress bars are purely cosmetic; keep the module usable without tqdm.
    def tqdm(iterable, *args, **kwargs):
        """Fallback for a missing tqdm: return *iterable* unchanged."""
        return iterable

script_dir = os.path.dirname(os.path.abspath(__file__))

# State shared between the main process and the pool workers.
manager = multiprocessing.Manager()
similarity_list = manager.list()   # (reference similarity, Manhattan similarity)
proteinListNew = manager.list()    # protein ids, in similarity-matrix column order

# Configuration placeholders, overwritten by the caller before use.
representation_dataframe = ""
protein_names = ""
representation_name = ""
similarity_tasks = ""
detailed_output = False


def parallelSimilarity(paramList):
    """Compute the Manhattan similarity of one protein pair (pool worker).

    Args:
        paramList: Sequence ``(i, j, real)`` where ``i`` and ``j`` index
            ``proteinListNew`` and ``real`` is the reference similarity of
            that pair.

    Returns:
        The shared ``similarity_list``. ``(real, manhattan_similarity)`` is
        appended to it only when ``j > i`` and both proteins are contained in
        ``protein_names``; otherwise the list is returned untouched.
    """
    i = paramList[0]
    j = paramList[1]
    if j > i:
        # The local names protein1/protein2 are referenced by the query strings.
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        if protein1 in protein_names and protein2 in protein_names:
            prot1vec = np.asarray(
                representation_dataframe.query("Entry == @protein1")['Vector'].item()
            )
            prot2vec = np.asarray(
                representation_dataframe.query("Entry == @protein2")['Vector'].item()
            )

            # Manhattan distance normalised by the sum of the L1 norms.
            norm_sum = norm(prot1vec, 1) + norm(prot2vec, 1)
            if norm_sum == 0:
                # Two all-zero vectors are identical; checking first avoids
                # the 0/0 division (NaN plus RuntimeWarning).
                manhattanSim = 1.0
            else:
                manhattanDist = cdist(
                    prot1vec.reshape(1, -1), prot2vec.reshape(1, -1), 'cityblock'
                )
                manhattanSim = 1 - (manhattanDist / norm_sum).item()

            similarity_list.append((paramList[2], manhattanSim))
    return similarity_list


def calculateCorrelationforOntology(aspect, matrix_type):
    """Return the Spearman correlation between reference and embedding similarities.

    Args:
        aspect: String inserted into the input file names (the callers in this
            module pass "MF", "BP" or "CC").
        matrix_type: One of "All", "500", "sparse" or "200"; selects the
            similarity-matrix CSV. "sparse" additionally restricts the pairs
            to the coordinates stored in the matching ``.npy`` file.

    Returns:
        Tuple ``(correlation, p_value)`` as produced by
        ``scipy.stats.spearmanr``.

    Raises:
        KeyError: If ``matrix_type`` is not one of the supported values.
    """
    # Reset the shared state left over from a previous call.
    similarity_list[:] = []
    proteinListNew[:] = []

    prefix = "../data/preprocess/human_" + aspect
    # NOTE: "sparse" deliberately points at the same matrix file as "500".
    similarityMatrixNameDict = {
        "All": os.path.join(script_dir, prefix + "_proteinSimilarityMatrix.csv"),
        "500": os.path.join(script_dir, prefix + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"),
        "sparse": os.path.join(script_dir, prefix + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"),
        "200": os.path.join(script_dir, prefix + "_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv"),
    }
    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(
        human_proteinSimilarityMatrix.columns, inplace=True
    )

    # Keep a plain local copy for the main process and publish the ids to the
    # workers with a single proxy call instead of one round-trip per protein.
    protein_ids = list(human_proteinSimilarityMatrix.columns)
    proteinListNew.extend(protein_ids)

    if matrix_type == "sparse":
        sparsified_path = os.path.join(
            script_dir,
            "../data/auxilary_input/SparsifiedSimilarityCoordinates_" + aspect + "_for_highest_500.npy",
        )
        index_pairs = np.load(sparsified_path)
        pair_count = len(index_pairs)
        # NOTE(review): every stored coordinate becomes a task, but the worker
        # still skips pairs with j <= i -- confirm that this is intended.
    else:
        # Upper triangle only: identical pairs, in identical order, to
        # filtering the full i x j product by j > i.
        protein_count = len(protein_ids)
        index_pairs = itertools.combinations(range(protein_count), 2)
        pair_count = protein_count * (protein_count - 1) // 2

    protParamListNew = []
    for tup in tqdm(index_pairs, total=pair_count):
        i = tup[0]
        j = tup[1]
        real = human_proteinSimilarityMatrix.loc[protein_ids[i], protein_ids[j]]
        protParamListNew.append((i, j, real))

    pool = multiprocessing.Pool()
    try:
        # Workers write into the shared similarity_list; the loop only drains
        # the iterator so that the progress bar advances.
        for _ in tqdm(pool.imap_unordered(parallelSimilarity, protParamListNew),
                      total=len(protParamListNew), position=0, leave=True):
            pass
    except BaseException:
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    # One slice request copies the whole shared list into this process.
    collected_pairs = similarity_list[:]
    real_distance_list = [value[0] for value in collected_pairs]
    manhattan_distance_list = [value[1] for value in collected_pairs]

    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    return (manhattanCorr[0], manhattanCorr[1])


def calculate_all_correlations():
    """Run the correlation benchmark for every configured task and aspect.

    Iterates over the module-level ``similarity_tasks`` and the aspects
    "MF", "BP" and "CC".

    Returns:
        Dict mapping ``"<task>_<aspect>_correlation"`` and
        ``"<task>_<aspect>_pvalue"`` to the corresponding values.
    """
    results = {}
    for similarity_matrix_type in similarity_tasks:
        for aspect in ["MF", "BP", "CC"]:
            corr, p_value = calculateCorrelationforOntology(aspect, similarity_matrix_type)
            results[f"{similarity_matrix_type}_{aspect}_correlation"] = corr
            results[f"{similarity_matrix_type}_{aspect}_pvalue"] = p_value
    return results