import sys import click import numpy as np from sklearn.neighbors import KDTree import pandas as pd from tqdm import tqdm def rhoDelta(data,resol,dc,radius): pos = data[[1, 4]].to_numpy() // resol posTree = KDTree(pos, leaf_size=30, metric='chebyshev') NNindexes, NNdists = posTree.query_radius(pos, r=radius, return_distance=True) _l = [] for v in NNindexes: _l.append(len(v)) _l=np.asarray(_l) data = data[_l>5].reset_index(drop=True) if data.shape[0] != 0: pos = data[[1, 4]].to_numpy() // resol val = data[6].to_numpy() try: posTree = KDTree(pos, leaf_size=30, metric='chebyshev') NNindexes, NNdists = posTree.query_radius(pos, r=dc, return_distance=True) except ValueError as e: if "Found array with 0 sample(s)" in str(e): print("#"*88,'\n#') print("#\033[91m Error!!! The data is too sparse. Please decrease the value of: [t]\033[0m\n#") print("#"*88,'\n') sys.exit(1) else: raise rhos = [] for i in range(len(NNindexes)): rhos.append(np.dot(np.exp(-(NNdists[i] / dc) ** 2), val[NNindexes[i]])) rhos = np.asarray(rhos) _r = 100 _indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True) deltas = rhos * 0 LargerNei = rhos * 0 - 1 for i in range(len(_indexes)): idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]]) if idx.shape[0] == 0: deltas[i] = _dists[i][-1] + 1 else: LargerNei[i] = _indexes[i][idx[0]] deltas[i] = _dists[i][idx[0]] failed = np.argwhere(LargerNei == -1).flatten() while len(failed) > 1 and _r < 100000: _r = _r * 10 _indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True) for i in range(len(_indexes)): idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]]) if idx.shape[0] == 0: deltas[failed[i]] = _dists[i][-1] + 1 else: LargerNei[failed[i]] = _indexes[i][idx[0]] deltas[failed[i]] = _dists[i][idx[0]] failed = np.argwhere(LargerNei == -1).flatten() data['rhos']=rhos data['deltas']=deltas else: data['rhos']=[] data['deltas']=[] return data @click.command() 
@click.option('-dc','--distance_cutoff', type=int, default=5, help='Distance cutoff for local density calculation in terms of bin. [5]')
@click.option('-t','--threshold', type=float, default=0.6, help='Loop score threshold [0.6]')
@click.option('-r','--resol', default=5000, help='resolution [5000]')
@click.option('-R','--radius', type=int, default=2, help='Radius threshold to remove outliers. [2]')
@click.option('-d','--mindelta', type=float, default=5, help='Min distance allowed between two loops [5]')
@click.option('-i','--candidates', type=str,required=True,help ='Loop candidates file path')
@click.option('-o','--output', type=str,required=True,help ='.bedpe file path to save loops')
def pool(distance_cutoff,candidates,resol,mindelta,threshold,output,radius,refine=True):
    """Call loops from loop candidates by clustering
    """
    # NOTE: the docstring above is shown by click as the command help text,
    # so the detailed description lives in these comments instead.
    #
    # Steps:
    #   1. Read the tab-separated, header-less candidates file.
    #   2. Keep rows whose column 6 exceeds `threshold` and whose column 4 is
    #      more than 11 * resol above column 1.
    #   3. Per value of column 0, compute rho/delta with `rhoDelta`.
    #   4. Per value of column 0, pick cluster centres (rho > 0 and
    #      delta > mindelta), attach every other point to the cluster of its
    #      nearest denser neighbour, and keep one row per cluster: the row
    #      with the largest column 6 when `refine` is true, otherwise the
    #      centre itself.
    #   5. Write columns 0-6 of the kept rows to `output`, sorted by column 6
    #      in descending order.
    #
    # Raises ValueError when no value of column 0 present in the input ends
    # up in the output; exits with status 1 on empty or too sparse input.
    failure_msg = "polaris loop pool FAILED :(\nPlease check input and mcool file to yield scoreFile. Or use higher '-s' value for more sparse mcool data."

    print('\npolaris loop pool START :) ')

    try:
        data = pd.read_csv(candidates, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # read_csv raises on a completely empty file; route it to the same
        # friendly message as a file without rows.
        data = pd.DataFrame()
    if data.shape[0] == 0:
        _abort("The file is empty. Please check your file.")

    ccs = set(data.iloc[:,0])

    data = data[data[6] > threshold].reset_index(drop=True)
    data = data[data[4] - data[1] > 11*resol].reset_index(drop=True)
    if data.shape[0] == 0:
        _abort("The data is too sparse. Please decrease: [threshold] (minimum: 0.5).")

    data[['rhos','deltas']]=0
    # Iterate the groups explicitly instead of groupby().apply(): apply() is
    # deprecated for functions that need the grouping column, and column 0 is
    # still required below.
    scored = [
        rhoDelta(group, resol=resol, dc=distance_cutoff, radius=radius)
        for _, group in data.groupby(0)
    ]
    # Groups emptied by the outlier filter carry no information and would only
    # disturb the dtypes of the concatenated frame.
    scored = [frame for frame in scored if frame.shape[0] > 0]
    if not scored:
        raise ValueError(failure_msg)
    targetData = pd.concat(scored, ignore_index=True)

    minrho=0
    loopPds=[]
    chroms=tqdm(set(targetData[0]), dynamic_ncols=True)
    for chrom in chroms:
        chroms.desc = f"[Runing clustering on {chrom}]"
        data = targetData[targetData[0]==chrom].reset_index(drop=True)

        pos = data[[1, 4]].to_numpy() // resol
        posTree = KDTree(pos, leaf_size=30, metric='chebyshev')

        rhos = data['rhos'].to_numpy()
        deltas = data['deltas'].to_numpy()
        centroid = np.flatnonzero((rhos > minrho) & (deltas > mindelta))

        LargerNei = _denser_neighbor_index(posTree, pos, rhos)

        # Centres get their own label; every other point inherits the label of
        # its nearest denser neighbour.  Visiting points by decreasing rho
        # ensures the denser neighbour has been handled first.
        label = np.full(len(rhos), -1)
        label[centroid] = np.arange(len(centroid))
        for i in np.argsort(-rhos):
            if label[i] == -1:
                # NOTE(review): when LargerNei[i] is -1 (no denser neighbour
                # found) this reads the label of the LAST point, as the
                # original code did -- confirm whether that is intended.
                label[i] = label[LargerNei[i]]

        if refine:
            # Within each cluster keep the row with the highest column 6.
            val = data[6].to_numpy()
            refinedLoop = []
            for cluster in np.unique(label):
                members = np.flatnonzero(label == cluster)
                refinedLoop.append(members[np.argmax(val[members])])
            loopPds.append(data.loc[refinedLoop])
        else:
            loopPds.append(data.loc[centroid])

    loopPd=pd.concat(loopPds).sort_values(6,ascending=False)
    loopPd[[1, 2, 4, 5]] = loopPd[[1, 2, 4, 5]].astype(int)
    loopPd[[0,1,2,3,4,5,6]].to_csv(output,sep='\t',header=False, index=False)

    ccs_ = set(loopPd.iloc[:,0])
    badc = ccs.difference(ccs_)
    if len(badc) == len(ccs):
        raise ValueError(failure_msg)
    else:
        print(f'\npolaris loop pool FINISHED :)\n{len(loopPd)} loops saved to {output}')
    if len(badc) > 0:
        print(f"But the loop score of {badc} are too sparse.\nYou may need to check the mcool data or re-run polaris loop score by increasing -s.")


def _abort(message):
    """Print ``message`` inside the red error banner and exit with status 1."""
    print("#"*88,'\n#')
    print(f"#\033[91m Error!!! {message}\033[0m\n#")
    print("#"*88,'\n')
    sys.exit(1)


def _denser_neighbor_index(tree, pos, rhos):
    """Return, per point, the index of its nearest neighbour with a larger rho.

    Neighbours are searched within 100 bins first; points without a denser
    neighbour are retried with a ten times larger radius, up to 100000 bins.
    Points that never find one keep the value -1.  One such point is always
    expected: the point with the highest rho.
    """
    larger_nei = np.full(len(rhos), -1)
    search_radius = 100
    pending = np.arange(len(rhos))
    while True:
        # sort_results requires return_distance=True; only the indices are used.
        neigh_idx, _ = tree.query_radius(
            pos[pending], r=search_radius, return_distance=True, sort_results=True
        )
        for target, idx in zip(pending, neigh_idx):
            # idx[0] is the closest hit, i.e. the query point at distance 0.
            denser = np.flatnonzero(rhos[idx] > rhos[idx[0]])
            if denser.size:
                # Scalar element, not a one-element array (deprecated
                # assignment since NumPy 1.25).
                larger_nei[target] = idx[denser[0]]
        pending = np.flatnonzero(larger_nei == -1)
        if len(pending) <= 1 or search_radius >= 100000:
            break
        search_radius *= 10
    return larger_nei


if __name__ == '__main__':
    pool()