# NOTE: the three lines originally here ("Spaces:", "Sleeping", "Sleeping")
# were page-scrape artifacts, not Python code; neutralized as this comment.
import argparse
import os
import sys
import time
from logging import getLogger

import numpy as np
import pandas as pd
import streamlit as st
from scipy.sparse import csr_matrix, triu
from tensorflow.keras.models import model_from_json
from tqdm import tqdm
# Module-level logger, named after this module per logging convention.
logger = getLogger(__name__)
def anchor_list_to_dict(anchors):
    """Return a mapping from anchor name to its positional (row/column) index."""
    return {name: idx for idx, name in enumerate(anchors)}
def anchor_to_locus(anchor_dict):
    """Return a callable mapping an anchor name to its index in *anchor_dict*.

    The callable raises KeyError for unknown anchors, exactly like the dict
    lookup it wraps; it is suitable for use with np.vectorize.
    """
    return anchor_dict.__getitem__
def locus_to_anchor(anchor_list):
    """Return a callable mapping a matrix index back to its anchor name.

    Inverse of the lookup built by anchor_to_locus; suitable for np.vectorize.
    """
    return anchor_list.__getitem__
def predict_tile(args):
    """Multiprocessing worker: denoise one window of a sparse matrix.

    *args* is a tuple
    ``(model, shared_denoised, shared_overlap, matrix, window_x, window_y)``
    with an optional seventh element giving the tile edge length.  When the
    seventh element is absent, the module-level ``small_matrix_size`` global
    (set in ``__main__``) is used, preserving the original behavior — the
    original always read that global, which raises NameError if this module
    is imported rather than run as a script.

    Side effects only: accumulates the model's prediction into
    ``shared_denoised`` and increments ``shared_overlap`` so overlapping
    windows can be averaged by the caller.  Returns None.
    """
    model, shared_denoised, shared_overlap, matrix, window_x, window_y = args[:6]
    tile_size = args[6] if len(args) > 6 else small_matrix_size
    tile = matrix[window_x, window_y].A  # densify this window of the sparse matrix
    if tile.shape != (tile_size, tile_size):
        return  # partial edge window: not a full tile the model can consume
    tile = tile[np.newaxis, :, :, np.newaxis]  # add batch and channel dimensions
    # Shared ctypes buffers viewed as numpy arrays (no copy), so writes below
    # are visible to the parent process.
    tmp_denoised = np.ctypeslib.as_array(shared_denoised)
    tmp_overlap = np.ctypeslib.as_array(shared_overlap)
    denoised = model.predict(tile).reshape((tile_size, tile_size))
    denoised[denoised < 0] = 0  # clamp any negative predictions
    tmp_denoised[window_x, window_y] += denoised
    tmp_overlap[window_x, window_y] += 1
def sparse_prediction_from_file(
    model,
    matrix,
    anchor_list,
    small_matrix_size=128,
    step_size=64,
    max_dist=384,
    keep_zeros=True,
):
    """Denoise a sparse contact matrix by sliding square tiles through *model*.

    A ``small_matrix_size``-square window slides across *matrix* in steps of
    ``step_size``; windows farther than ``max_dist`` from the diagonal are
    skipped.  Each full tile is fed to ``model.predict``, overlapping
    predictions are averaged, the result is symmetrized, its diagonal zeroed,
    and the upper triangle returned as a COO sparse matrix.

    Parameters
    ----------
    model : object with a Keras-style ``predict`` accepting a
        ``(1, small_matrix_size, small_matrix_size, 1)`` batch.
    matrix : scipy sparse matrix of interaction values.
    anchor_list : sequence of anchors; only its length is used here, as the
        logical matrix size.
    keep_zeros : if False, explicitly stored zeros are dropped from the result.

    Returns
    -------
    scipy.sparse.coo_matrix with the denoised upper-triangular values.
    """
    input_matrix_size = len(anchor_list)
    # Accumulators for summed predictions and per-cell tile coverage.
    # (The original built these with np.zeros_like(matrix.A), densifying the
    # sparse matrix twice for no benefit; allocate directly instead.)
    denoised_matrix = np.zeros(matrix.shape, dtype=matrix.dtype)
    overlap_counts = np.zeros(matrix.shape, dtype=matrix.dtype)
    for i in range(0, input_matrix_size, step_size):
        for j in range(0, input_matrix_size, step_size):
            if abs(i - j) > max_dist:
                continue  # beyond the band that holds real interaction values
            rows = slice(i, i + small_matrix_size)
            cols = slice(j, j + small_matrix_size)
            # Anchor edge windows to the matrix border so the last tile is
            # still full-size.  max(..., 0) guards against a negative start
            # when input_matrix_size < small_matrix_size (the original
            # produced a silently-empty slice in that case).
            if i + small_matrix_size >= input_matrix_size:
                rows = slice(max(input_matrix_size - small_matrix_size, 0), input_matrix_size)
            if j + small_matrix_size >= input_matrix_size:
                cols = slice(max(input_matrix_size - small_matrix_size, 0), input_matrix_size)
            tile = matrix[rows, cols].A  # densify this window
            if tile.shape != (small_matrix_size, small_matrix_size):
                continue  # partial tile the model cannot consume
            batch = tile[np.newaxis, :, :, np.newaxis]  # add batch and channel dims
            denoised = model.predict(batch).reshape(
                (small_matrix_size, small_matrix_size)
            )
            denoised[denoised < 0] = 0  # clamp any negative predictions
            denoised_matrix[rows, cols] += denoised  # accumulate predictions
            overlap_counts[rows, cols] += 1  # count tiles covering each cell
    # Average overlapping tiles; cells no tile covered stay 0 (where= guards
    # against division by zero).
    denoised_matrix = np.divide(
        denoised_matrix,
        overlap_counts,
        out=np.zeros_like(denoised_matrix),
        where=overlap_counts != 0,
    )
    denoised_matrix = (denoised_matrix + denoised_matrix.T) * 0.5  # force symmetry
    np.fill_diagonal(denoised_matrix, 0)  # self-interactions carry no signal here
    sparse_denoised_matrix = triu(denoised_matrix, format="coo")
    if not keep_zeros:
        sparse_denoised_matrix.eliminate_zeros()
    return sparse_denoised_matrix
def predict_and_write(
    model,
    full_matrix_dir,
    input_name,
    outdir,
    anchor_dir,
    chromosome,
    small_matrix_size,
    step_size,
    dummy=5,
    max_dist=384,
    val_cols=None,
    keep_zeros=True,
    matrices_per_tile=8,
):
    """Denoise one chromosome's anchor-to-anchor interaction file.

    Reads the anchor reference bed (``<anchor_dir>/<chromosome>.bed``) and the
    interaction file (``<full_matrix_dir>/<input_name>``), builds a sparse
    ratio matrix per block of ``matrices_per_tile * small_matrix_size``
    anchors, denoises each block with ``sparse_prediction_from_file``, and
    writes the concatenated (anchor1, anchor2, denoised) table to
    ``<outdir>/<chromosome>.denoised.anchor.to.anchor`` (tab-separated, no
    header).

    Returns the concatenated DataFrame.

    Raises
    ------
    ValueError if *val_cols* provides neither a ``ratio`` column nor the
    ``obs``/``exp`` pair needed to compute one.
    """
    # Avoid the original mutable default argument; None means obs/exp.
    if val_cols is None:
        val_cols = ["obs", "exp"]
    start_time = time.time()
    anchor_file = os.path.join(anchor_dir, chromosome + ".bed")
    anchor_list = pd.read_csv(
        anchor_file,
        sep="\t",
        usecols=[0, 1, 2, 3],
        names=["chr", "start", "end", "anchor"],
    )  # reference anchors for this chromosome
    logger.debug("anchor file")
    logger.debug(os.path.join(full_matrix_dir, input_name))
    chr_anchor_file = pd.read_csv(
        os.path.join(full_matrix_dir, input_name),
        delimiter="\t",
        names=["anchor1", "anchor2"] + val_cols,
        usecols=["anchor1", "anchor2"] + val_cols,
    )  # chromosome anchor-to-anchor interaction values
    if "obs" in val_cols and "exp" in val_cols:
        # Ratio with a dummy pseudo-count to damp low-coverage noise.
        chr_anchor_file["ratio"] = (chr_anchor_file["obs"] + dummy) / (
            chr_anchor_file["exp"] + dummy
        )
    elif "ratio" not in val_cols:
        # BUGFIX: the original asserted `"ratio" not in val_cols`
        # unconditionally, which rejected exactly the input its own message
        # asks for (a pre-computed ratio column).
        raise ValueError(
            "Must provide either ratio column or obs and exp columns to compute ratio"
        )
    # Collect per-block frames and concatenate once at the end — repeated
    # pd.concat inside the loop (as originally written) is quadratic.
    denoised_frames = []
    start_time = time.time()
    anchor_step = matrices_per_tile * small_matrix_size
    for i in tqdm(range(0, len(anchor_list), anchor_step)):
        anchors = anchor_list[i : i + anchor_step]
        # anchor name -> local index within this block
        anchor_dict = anchor_list_to_dict(anchors["anchor"].values)
        chr_tile = chr_anchor_file[
            (chr_anchor_file["anchor1"].isin(anchors["anchor"]))
            & (chr_anchor_file["anchor2"].isin(anchors["anchor"]))
        ]
        # NOTE(review): np.vectorize raises on an empty chr_tile; assumed
        # every block has at least one interaction — confirm with real data.
        rows = np.vectorize(anchor_to_locus(anchor_dict))(
            chr_tile["anchor1"].values
        )  # anchor names -> row indices
        cols = np.vectorize(anchor_to_locus(anchor_dict))(
            chr_tile["anchor2"].values
        )  # anchor names -> column indices
        logger.debug(chr_tile)
        sparse_matrix = csr_matrix(
            (chr_tile["ratio"], (rows, cols)),
            shape=(anchor_step, anchor_step),
        )  # sparse CSR matrix for this block of anchors
        sparse_denoised_tile = sparse_prediction_from_file(
            model,
            sparse_matrix,
            anchors,
            small_matrix_size,
            step_size,
            max_dist,
            keep_zeros=keep_zeros,
        )
        if len(sparse_denoised_tile.row) > 0:
            anchor_name_list = anchors["anchor"].values.tolist()
            index_to_name = locus_to_anchor(anchor_name_list)
            anchor_1_list = np.vectorize(index_to_name)(sparse_denoised_tile.row)
            anchor_2_list = np.vectorize(index_to_name)(sparse_denoised_tile.col)
            tile_anchor_to_anchor = pd.DataFrame.from_dict(
                {
                    "anchor1": anchor_1_list,
                    "anchor2": anchor_2_list,
                    "denoised": sparse_denoised_tile.data,
                }
            ).round({"denoised": 4})
            denoised_frames.append(tile_anchor_to_anchor)
    denoised_anchor_to_anchor = (
        pd.concat(denoised_frames) if denoised_frames else pd.DataFrame()
    )
    print("Denoised matrix in %d seconds" % (time.time() - start_time))
    denoised_anchor_to_anchor.to_csv(
        os.path.join(outdir, chromosome + ".denoised.anchor.to.anchor"),
        sep="\t",
        index=False,
        header=False,
    )
    return denoised_anchor_to_anchor
if __name__ == "__main__":
    # CLI entry point: parse arguments, load the Keras model, and denoise one
    # chromosome's interaction file.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--full_matrix_dir",
        type=str,
        help="directory containing chromosome interaction files to be used as input",
    )
    parser.add_argument(
        "--input_name",
        type=str,
        help="name of file in full_matrix_dir that we want to feed into model",
    )
    parser.add_argument("--h5_file", type=str, help="path to model weights .h5 file")
    parser.add_argument(
        "--json_file",
        type=str,
        help="path to model architecture .json file (by default it is assumed to be the same as the weights file)",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        help="directory where the output interaction file will be stored",
    )
    parser.add_argument(
        "--anchor_dir",
        type=str,
        help="directory containing anchor .bed reference files",
    )
    parser.add_argument(
        "--chromosome", type=str, help="chromosome string (e.g chr1, chr20, chrX)"
    )
    parser.add_argument(
        "--small_matrix_size",
        type=int,
        default=128,
        help="size of input tiles (symmetric)",
    )
    parser.add_argument(
        "--step_size",
        type=int,
        default=128,
        help="step size when tiling matrix (overlapping values will be averaged if different)",
    )
    parser.add_argument(
        "--max_dist",
        type=int,
        default=384,
        help="maximum distance from diagonal (in pixels) where we consider interactions (default to ~2Mb)",
    )
    parser.add_argument(
        "--dummy",
        type=int,
        default=5,
        help="dummy value to compute ratio (obs + dummy) / (exp + dummy)",
    )
    parser.add_argument(
        "--val_cols",
        "--list",
        nargs="+",
        help="names of value columns in interaction files (not including a1, a2)",
        default=["obs", "exp"],
    )
    parser.add_argument(
        "--keep_zeros",
        action="store_true",
        help="if provided, the output file will contain all pixels in every tile, even if no value is present",
    )
    args = parser.parse_args()

    full_matrix_dir = args.full_matrix_dir
    input_name = args.input_name
    h5_file = args.h5_file
    if args.json_file is not None:
        json_file = args.json_file
    else:
        # BUGFIX: the original used h5_file.replace("h5", "json"), which
        # rewrote EVERY "h5" substring in the path (directory names included)
        # and missed extensions like ".hdf5"; swap only the file extension.
        json_file = os.path.splitext(h5_file)[0] + ".json"
    outdir = args.outdir
    anchor_dir = args.anchor_dir
    chromosome = args.chromosome
    # NOTE: these module-level names also serve as the globals that
    # predict_tile falls back to when called with a 6-element args tuple.
    small_matrix_size = args.small_matrix_size
    step_size = args.step_size
    dummy = args.dummy
    max_dist = args.max_dist
    val_cols = args.val_cols
    keep_zeros = args.keep_zeros

    os.makedirs(outdir, exist_ok=True)  # ensure the output directory exists

    # Rebuild the model from its JSON architecture, then load the weights.
    with open(json_file, "r") as f:
        model = model_from_json(f.read())
    model.load_weights(h5_file)

    predict_and_write(
        model,
        full_matrix_dir,
        input_name,
        outdir,
        anchor_dir,
        chromosome,
        small_matrix_size,
        step_size,
        dummy,
        max_dist,
        val_cols,
        keep_zeros,
    )