dylanplummer commited on
Commit
b3dae8f
·
1 Parent(s): 54e7bcd

add utils

Browse files
Files changed (1) hide show
  1. utils/utils.py +819 -0
utils/utils.py ADDED
@@ -0,0 +1,819 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import re
4
+ import cv2
5
+ import random
6
+ import pickle
7
+ import numpy as np
8
+ import tensorflow.keras.backend as K
9
+ import pandas as pd
10
+ import matplotlib.pyplot as plt
11
+ import matplotlib.colors
12
+ import matplotlib.cm
13
+ import scipy.sparse
14
+ from scipy.sparse import coo_matrix, csr_matrix, triu, tril
15
+ import scipy.ndimage
16
+
17
# Mapping from chromosome name to a zero-based integer label
# (autosomes chr1-chr22 in order, then chrX and chrY).
chromosome_labels = {'chr1': 0, 'chr2': 1, 'chr3': 2, 'chr4': 3, 'chr5': 4, 'chr6': 5, 'chr7': 6, 'chr8': 7, 'chr9': 8,
                     'chr10': 9, 'chr11': 10, 'chr12': 11, 'chr13': 12, 'chr14': 13, 'chr15': 14, 'chr16': 15, 'chr17': 16, 'chr18': 17,
                     'chr19': 18, 'chr20': 19, 'chr21': 20, 'chr22': 21, 'chrX': 22, 'chrY': 23}

# Working directory for cached samples/pickles and the nested cache of
# pre-compiled sparse matrices (both relative to the current working dir).
data_dir = 'data/'
sparse_data_dir = 'data/sparse/'
# exist_ok=True replaces the try/mkdir/except FileExistsError dance and also
# creates the parent directory in one call.
os.makedirs(data_dir, exist_ok=True)
os.makedirs(sparse_data_dir, exist_ok=True)
31
+
32
+
33
def open_anchor_to_anchor(filename):
    '''
    Read a tab delimited anchor to anchor file as a DataFrame.

    The column layout is detected from the file's width:

    * 4 columns: ``anchor1 anchor2 obs exp`` (ratio is computed with a dummy of 5)
    * 5 columns: ``anchor1 anchor2 obs exp p_val`` (ratio is computed with a dummy of 5)
    * otherwise: ``anchor1 anchor2 ratio`` (denoised/enhanced output stores ratio directly)

    Args:
        filename (:obj:`str`) : full path to anchor to anchor file

    Returns:
        ``pandas.DataFrame``: always restricted to the columns ``anchor1 anchor2 ratio``
    '''
    # Probe only the first row to count columns; the previous version parsed
    # the entire file once just to measure its width, then re-read it.
    n_cols = len(pd.read_csv(filename, sep='\t', nrows=1).columns)
    if n_cols == 4:  # raw obs/exp file, before denoising
        df = pd.read_csv(filename,
                         sep='\t',
                         names=['anchor1', 'anchor2', 'obs', 'exp'])
        df['ratio'] = (df['obs'] + 5) / (df['exp'] + 5)  # dummy value of 5
    elif n_cols == 5:  # obs/exp file that also carries a p-value column
        df = pd.read_csv(filename,
                         sep='\t',
                         names=['anchor1', 'anchor2', 'obs', 'exp', 'p_val'])
        df['ratio'] = (df['obs'] + 5) / (df['exp'] + 5)
    else:  # after denoise: no obs or exp, ratio is stored directly
        df = pd.read_csv(filename,
                         sep='\t',
                         names=['anchor1', 'anchor2', 'ratio'])
    df = df[['anchor1', 'anchor2', 'ratio']]
    return df
61
+
62
+
63
def open_full_genome(data_dir):
    '''
    Concatenate every per-chromosome anchor to anchor file in a directory
    into one genome-wide DataFrame.

    Args:
        data_dir (:obj:`str`) : directory containing per-chromosome files whose
            names contain ``anchor_2_anchor`` or ``denoised.anchor.to.anchor``

    Returns:
        ``pandas.DataFrame``: concatenation of all matching files, with the
        ``anchor1 anchor2 ratio`` columns produced by :func:`open_anchor_to_anchor`
    '''
    print('Opening genome-wide anchor to anchor...')
    frames = []
    for chr_file in os.listdir(data_dir):
        # only per-chromosome interaction files; skip anything else in the dir
        if 'anchor_2_anchor' in chr_file or 'denoised.anchor.to.anchor' in chr_file:
            print(chr_file)
            frames.append(open_anchor_to_anchor(data_dir + '/' + chr_file))
    # A single concat is linear in total size; the previous per-iteration
    # pd.concat re-copied the accumulated frame every loop (quadratic).
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
79
+
80
+
81
def get_chromosome_from_filename(filename):
    """
    Extract the chromosome string from any of the file name formats we use

    Args:
        filename (:obj:`str`) : name of anchor to anchor file

    Returns:
        Chromosome string of form chr<>
    """
    start = filename.find('chr')  # where the chromosome name begins
    if start == 0:
        # name like 'chr7.something': chromosome is the prefix up to the first dot
        return filename[:filename.find('.')]
    last_dot = filename.rfind('.')
    # 'something.chr7' -> chromosome is the suffix after the last dot;
    # 'prefix.chr7.ext' -> chromosome is the middle segment
    return filename[start:] if start > last_dot else filename[start:last_dot]
99
+
100
+
101
def locus_to_anchor(chr_name, locus, anchor_dir):
    """
    Find the index of the first anchor whose interval contains a genomic locus.

    Args:
        chr_name (:obj:`str`) : chromosome name of form chr<>
        locus (:obj:`int`) : genomic coordinate to look up
        anchor_dir (:obj:`str`) : directory (including trailing slash) containing
            the reference anchor ``.bed`` files

    Returns:
        ``int``: zero-based index of the first anchor containing ``locus``

    Raises:
        IndexError: if no anchor on ``chr_name`` contains ``locus``
    """
    anchor_list = pd.read_csv(anchor_dir + '%s.bed' % chr_name, sep='\t',
                              names=['chr', 'start', 'end', 'anchor'])  # read anchor list file
    loci_indices = (anchor_list['start'] <= locus) & (locus <= anchor_list['end']) & (
            anchor_list['chr'] == chr_name)
    # np.where used to be evaluated twice (once just to print); do it once
    anchor_index = int(np.where(loci_indices)[0][0])
    print(anchor_index)
    return anchor_index
108
+
109
+
110
def save_samples(input_dir, target_dir, matrix_size, multi_input=False, dir_3=None, combined_dir=None, anchor_dir=None, name='sample', chr_name='chr6', locus_start=25922605, locus_end=26709867, force_size=128, force_symmetry=True):
    """
    Saves sample matrices for use in training visualizations

    Args:
        input_dir (:obj:`str`) : directory containing input anchor to anchor files
        target_dir (:obj:`str`) : directory containing target anchor to anchor files
        matrix_size (:obj:`int`) : size of each sample matrix
        multi_input (:obj:`bool`) : set to True to save samples from each of the multiple input sets in ``input_dir``
        dir_3 (:obj:`str`) : optional directory containing third set of input anchor to anchor files
        combined_dir (:obj:`str`) : optional directory containing combined target anchor to anchor files
        anchor_dir (:obj:`str`) : directory containing anchor reference ``.bed`` files
        name (:obj:`str`) : each saved sample file will begin with this string
        chr_name (:obj:`str`) : chromosome to sample, e.g. ``'chr6'`` (must be a
            numbered chromosome: the number is parsed with ``int``)
        locus_start (:obj:`int`) : genomic coordinate of the start of the region
        locus_end (:obj:`int`) : genomic coordinate of the end of the region
        force_size (:obj:`int`) : side length (in anchors) of the saved tiles
        force_symmetry (:obj:`bool`) : forwarded to :func:`load_chr_ratio_matrix_from_sparse`
    """
    global data_dir
    global sparse_data_dir
    os.makedirs(sparse_data_dir, exist_ok=True)  # replaces try/mkdir/except FileExistsError
    if multi_input:
        input_folder_1 = os.listdir(input_dir)[0] + '/'
        input_folder_2 = os.listdir(input_dir)[1] + '/'
        try:
            input_folder_3 = os.listdir(input_dir)[2] + '/'
        except IndexError:
            # NOTE(review): with only two input sets, the matrix_3 load below
            # (non-enhance multi_input path) would hit an unbound input_folder_3
            pass
        chr_file_listing = os.listdir(input_dir + input_folder_1)
    else:
        # bug fix: chr_index used to clamp against os.listdir(input_dir + input_folder_1)
        # unconditionally, but input_folder_1 is only bound when multi_input is True,
        # raising NameError for single-input calls; pick the listing per mode instead
        chr_file_listing = os.listdir(input_dir)
    chr_index = min(int(chr_name.replace('chr', '')), len(chr_file_listing) - 1)
    print('Saving samples from', chr_name, '...')
    if (name == 'enhance' or name == 'val_enhance') and multi_input:
        matrix_1 = load_chr_ratio_matrix_from_sparse(input_dir + input_folder_1, os.listdir(input_dir + input_folder_1)[chr_index], anchor_dir, force_symmetry=force_symmetry)
        matrix_2 = load_chr_ratio_matrix_from_sparse(target_dir, os.listdir(target_dir)[chr_index], anchor_dir, force_symmetry=force_symmetry)
        matrix_3 = None
        combined_matrix = None
    else:
        if multi_input:
            matrix_1 = load_chr_ratio_matrix_from_sparse(input_dir + input_folder_1, os.listdir(input_dir + input_folder_1)[chr_index], anchor_dir, force_symmetry=force_symmetry)
            matrix_2 = load_chr_ratio_matrix_from_sparse(input_dir + input_folder_2, os.listdir(input_dir + input_folder_2)[chr_index], anchor_dir, force_symmetry=force_symmetry)
            matrix_3 = load_chr_ratio_matrix_from_sparse(input_dir + input_folder_3, os.listdir(input_dir + input_folder_3)[chr_index], anchor_dir, force_symmetry=force_symmetry)
            combined_matrix = load_chr_ratio_matrix_from_sparse(target_dir, os.listdir(target_dir)[chr_index], anchor_dir, force_symmetry=force_symmetry)
        else:
            matrix_1 = load_chr_ratio_matrix_from_sparse(input_dir, os.listdir(input_dir)[chr_index], anchor_dir, force_symmetry=force_symmetry)
            matrix_2 = load_chr_ratio_matrix_from_sparse(target_dir, os.listdir(target_dir)[chr_index], anchor_dir, force_symmetry=force_symmetry)
            if dir_3 is not None:
                matrix_3 = load_chr_ratio_matrix_from_sparse(dir_3, os.listdir(dir_3)[chr_index], anchor_dir, force_symmetry=force_symmetry)
            else:
                matrix_3 = None
            if combined_dir is not None:
                combined_matrix = load_chr_ratio_matrix_from_sparse(combined_dir, os.listdir(combined_dir)[chr_index], anchor_dir, force_symmetry=force_symmetry)
            else:
                combined_matrix = None
    # convert genomic coordinates to anchor indices, then centre a
    # force_size x force_size window on the midpoint of the region
    i = locus_to_anchor(chr_name, locus_start, anchor_dir)
    j = locus_to_anchor(chr_name, locus_end, anchor_dir)
    mid = int((i + j) / 2)
    i = max(0, mid - int(force_size / 2))
    j = i + force_size
    rows = slice(i, j)
    cols = slice(i, j)
    tile_1 = matrix_1[rows, cols].A  # .A densifies the sparse slice
    tile_2 = matrix_2[rows, cols].A
    tile_1 = np.expand_dims(tile_1, -1)  # add channel dimension
    tile_1 = np.expand_dims(tile_1, 0)  # model expects a list of inputs
    tile_2 = np.expand_dims(tile_2, -1)
    tile_2 = np.expand_dims(tile_2, 0)
    if matrix_3 is not None:
        # NOTE(review): this slices [i:i + matrix_size, j:j + matrix_size] using
        # the re-bound window bounds, unlike tile_1/tile_2 which use the square
        # [i:j, i:j] — confirm the off-diagonal slice is intentional
        tile_3 = matrix_3[i:i + matrix_size, j:j + matrix_size].A
        tile_3 = np.expand_dims(tile_3, -1)
        tile_3 = np.expand_dims(tile_3, 0)
        np.save('%s%s_3' % (data_dir, name), tile_3)
    if combined_matrix is not None:
        combined_tile = combined_matrix[i:i + matrix_size, j:j + matrix_size].A
        combined_tile = np.expand_dims(combined_tile, -1)
        combined_tile = np.expand_dims(combined_tile, 0)
        np.save('%s%s_combined' % (data_dir, name), combined_tile)
    np.save('%s%s_1' % (data_dir, name), tile_1)
    np.save('%s%s_2' % (data_dir, name), tile_2)
188
+
189
+
190
def load_chr_ratio_matrix_from_sparse(dir_name, file_name, anchor_dir, sparse_dir=None, anchor_list=None, chr_name=None, dummy=5, ignore_sparse=False, force_symmetry=True, use_raw=False):
    """
    Loads data as a sparse matrix by either reading a precompiled sparse matrix or an anchor to anchor file which is converted to sparse CSR format.
    Ratio values are computed using the observed (obs) and expected (exp) values:

    .. math::
        ratio = \\frac{obs + dummy}{exp + dummy}

    Args:
        dir_name (:obj:`str`) : directory containing the anchor to anchor or precompiled (.npz) sparse matrix file
        file_name (:obj:`str`) : name of anchor to anchor or precompiled (.npz) sparse matrix file
        anchor_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
        sparse_dir (:obj:`str`) : optional override for the precompiled-matrix cache directory
            (note: rebinds the module-level ``sparse_data_dir`` for subsequent calls)
        anchor_list (:obj:`pandas.DataFrame`) : optional pre-loaded anchor reference list
        chr_name (:obj:`str`) : optional chromosome name; parsed from ``file_name`` if omitted
        dummy (:obj:`int`) : dummy value to used when computing ratio values
        ignore_sparse (:obj:`bool`) : set to True to ignore precompiled sparse matrices even if they exist
        force_symmetry (:obj:`bool`) : set to True to mirror the upper triangle so the matrix is symmetric
        use_raw (:obj:`bool`) : set to True to store raw observed counts instead of ratio values

    Returns:
        ``scipy.sparse.csr_matrix``: sparse matrix of ratio values

    Raises:
        ValueError: if neither ``anchor_list`` nor ``anchor_dir`` is supplied
    """
    global data_dir
    global sparse_data_dir
    if chr_name is None:
        chr_name = get_chromosome_from_filename(file_name)
    sparse_rep_dir = dir_name[dir_name[: -1].rfind('/') + 1:]  # directory where the pre-compiled sparse matrices are saved
    if sparse_dir is not None:
        sparse_data_dir = sparse_dir
    os.makedirs(os.path.join(sparse_data_dir, sparse_rep_dir), exist_ok=True)
    if file_name.endswith('.npz'):  # loading pre-combined and pre-compiled sparse data
        sparse_matrix = scipy.sparse.load_npz(dir_name + file_name)
    else:  # load from file name
        if file_name + '.npz' in os.listdir(os.path.join(sparse_data_dir, sparse_rep_dir)) and not ignore_sparse:  # check if pre-compiled data already exists
            sparse_matrix = scipy.sparse.load_npz(os.path.join(sparse_data_dir, sparse_rep_dir, file_name + '.npz'))
        else:  # otherwise generate sparse matrix from anchor2anchor file and save pre-compiled data
            if anchor_list is None:
                if anchor_dir is None:
                    # bug fix: this used to be `assert '<message>'`, which asserts a
                    # truthy string and can never fail; raise explicitly instead
                    raise ValueError('You must supply either an anchor reference list or the directory containing one')
                anchor_list = pd.read_csv(os.path.join(anchor_dir, '%s.bed' % chr_name), sep='\t',
                                          names=['chr', 'start', 'end', 'anchor'])  # read anchor list file
            matrix_size = len(anchor_list)  # matrix size is needed to construct sparse CSR matrix
            anchor_dict = anchor_list_to_dict(anchor_list['anchor'].values)  # convert to anchor --> index dictionary
            try:  # first try reading anchor to anchor file as <a1> <a2> <obs> <exp>
                chr_anchor_file = pd.read_csv(
                    os.path.join(dir_name, file_name),
                    delimiter='\t',
                    names=['anchor1', 'anchor2', 'obs', 'exp'],
                    usecols=['anchor1', 'anchor2', 'obs', 'exp'])  # read chromosome anchor to anchor file
                rows = np.vectorize(anchor_to_locus(anchor_dict))(chr_anchor_file['anchor1'].values)  # convert anchor names to row indices
                cols = np.vectorize(anchor_to_locus(anchor_dict))(chr_anchor_file['anchor2'].values)  # convert anchor names to column indices
                ratio = (chr_anchor_file['obs'] + dummy) / (chr_anchor_file['exp'] + dummy)  # compute matrix ratio value
                sparse_matrix = scipy.sparse.csr_matrix((ratio, (rows, cols)), shape=(matrix_size, matrix_size))  # construct sparse CSR matrix
            except Exception:  # narrowed from a bare except, which also swallowed KeyboardInterrupt/SystemExit
                # otherwise read anchor to anchor file as <a1> <a2> <ratio>
                chr_anchor_file = pd.read_csv(
                    os.path.join(dir_name, file_name),
                    delimiter='\t',
                    names=['anchor1', 'anchor2', 'ratio'],
                    usecols=['anchor1', 'anchor2', 'ratio'])
                rows = np.vectorize(anchor_to_locus(anchor_dict))(chr_anchor_file['anchor1'].values)  # convert anchor names to row indices
                cols = np.vectorize(anchor_to_locus(anchor_dict))(chr_anchor_file['anchor2'].values)  # convert anchor names to column indices
                if use_raw:
                    # NOTE(review): 'obs' is not among the columns parsed in this
                    # branch, so use_raw=True here raises KeyError — confirm intent
                    sparse_matrix = scipy.sparse.csr_matrix((chr_anchor_file['obs'], (rows, cols)), shape=(
                        matrix_size, matrix_size))  # construct sparse CSR matrix
                else:
                    sparse_matrix = scipy.sparse.csr_matrix((chr_anchor_file['ratio'], (rows, cols)), shape=(matrix_size, matrix_size))  # construct sparse CSR matrix
            if force_symmetry:
                upper_sum = triu(sparse_matrix, k=1).sum()
                lower_sum = tril(sparse_matrix, k=-1).sum()
                if upper_sum == 0 or lower_sum == 0:  # only one triangle is populated
                    sparse_matrix = sparse_matrix + sparse_matrix.transpose()
                # NOTE(review): triu + its transpose counts the main diagonal
                # twice — confirm downstream code expects the doubled diagonal
                sparse_triu = scipy.sparse.triu(sparse_matrix)
                sparse_matrix = sparse_triu + sparse_triu.transpose()
            if not ignore_sparse:
                scipy.sparse.save_npz(os.path.join(sparse_data_dir, sparse_rep_dir, file_name), sparse_matrix)  # save precompiled data
    return sparse_matrix
262
+
263
+
264
def split_matrix(input_filename,
                 input_matrix,
                 target_matrix,
                 input_batch,
                 target_batch,
                 matrix_size,
                 step_size,
                 batch_size,
                 n_matrices,
                 start_index,
                 normalize,
                 shuffle,
                 random_steps,
                 diagonal_only,
                 upper_triangular_only):
    """
    Generator function to split input and target sparse matrices into patches which are used for training and prediction.

    Args:
        input_filename (:obj:`str`): name of file which is being used to generate ratio matrix patches
        input_matrix (:obj:`scipy.sparse.csr_matrix`) : sparse CSR input matrix
        target_matrix (:obj:`scipy.sparse.csr_matrix`) : sparse CSR target matrix
        input_batch (:obj:`list`) : accumulator of input patches carried across calls
        target_batch (:obj:`list`) : accumulator of target patches carried across calls
        matrix_size (:obj:`int`) : size of each patch; pass -1 to yield the whole chromosome as one sample
        step_size (:obj:`int`) : size of steps used when generating batches. Values less than ``matrix_size`` will include overlapping regions
        batch_size (:obj:`int`) : number of patches to use in each batch
        n_matrices (:obj:`int`) : current number of matrix patches in the batch being generated
        start_index (:obj:`int`) : starting anchor index of the matrix splitting, ensures batches are not identical across epochs
        normalize (:obj:`bool`) : set to True to normalize all ratio values between ``[0, 1]``
        shuffle (:obj:`bool`) : set to True to randomly split the matrix instead of sliding across sequentially
        random_steps (:obj:`bool`) : set to True add a random offset to each step between patch indices
        diagonal_only (:obj:`bool`) : set to True to only generate patches along the diagonal of the matrix
        upper_triangular_only (:obj:`bool`) : set to True to skip patches below the diagonal (i < j)

    Yields:
        (``numpy.array``, ``numpy.array``, ``str``): input batch, target batch, and batch label
        with shape ``(batch_size, matrix_size, matrix_size, 1)``
    """
    if matrix_size == -1:
        # special case: emit the entire (densified) chromosome as a single
        # sample of shape (1, n, n, 1)
        input_matrix = np.expand_dims(np.expand_dims(input_matrix.A, 0), -1)
        target_matrix = np.expand_dims(np.expand_dims(target_matrix.A, 0), -1)
        yield input_matrix, target_matrix, input_filename + '_full_chr'
    else:
        if random_steps:  # random offset from step size intervals
            start_index = np.random.randint(0, step_size)
        row_indices = np.arange(start_index, input_matrix.shape[0], step_size)
        col_indices = np.arange(start_index, input_matrix.shape[1], step_size)
        if shuffle:  # shuffle slicing indices (rows and cols independently)
            np.random.shuffle(row_indices)
            np.random.shuffle(col_indices)
        for i in row_indices:
            for j in col_indices:
                if abs(i - j) > 384:  # max distance from diagonal with actual values
                    # NOTE(review): 384 is a hard-coded data-specific band width —
                    # confirm it matches the anchor resolution of the input data
                    continue
                if diagonal_only and i != j:
                    continue
                if upper_triangular_only and i < j:
                    continue
                input_tile = input_matrix[i:i + matrix_size, j:j + matrix_size].A
                target_tile = target_matrix[i:i + matrix_size, j:j + matrix_size].A
                input_batch.append(input_tile)
                target_batch.append(target_tile)
                n_matrices += 1
                if n_matrices == batch_size:
                    try:
                        # reshape raises ValueError for ragged edge tiles (slices
                        # near the matrix border are smaller than matrix_size)
                        input_batch = np.reshape(np.array(input_batch), (n_matrices, matrix_size, matrix_size, 1))
                        target_batch = np.reshape(np.array(target_batch), (n_matrices, matrix_size, matrix_size, 1))
                        if normalize:
                            input_batch = normalize_matrix(input_batch)
                            target_batch = normalize_matrix(target_batch)

                        yield input_batch, target_batch, input_filename + '_' + str(i)
                    except ValueError as e:  # reached end of valid values; drop the partial batch
                        input_batch = []
                        target_batch = []
                        n_matrices = 0
                        pass
                    # reset the accumulators after every full batch (yielded or dropped)
                    input_batch = []
                    target_batch = []
                    n_matrices = 0
345
+
346
+
347
+
348
def generate_batches_from_chr(input_dir,
                              target_dir,
                              matrix_size,
                              batch_size,
                              anchor_dir=None,
                              step_size=64,
                              multi_input=False,
                              shuffle=False,
                              random_steps=False,
                              normalize=False,
                              diagonal_only=False,
                              upper_triangular_only=False,
                              force_symmetry=True,
                              ignore_XY=True,
                              ignore_even_chr=False,
                              ignore_odd_chr=False):
    """
    Generator function which generates batches of input target pairs to train the model:

    .. code-block:: python
        :linenos:

        for epoch_i in range(epochs):
            for input_batch, target_batch, batch_label in generate_batches_from_chr(input_dir,
                                                                                    target_dir,
                                                                                    matrix_size=128,
                                                                                    batch_size=64,
                                                                                    step_size=64,
                                                                                    shuffle=True,
                                                                                    random_steps=True,
                                                                                    anchor_dir=anchor_dir):
                step_start_time = time.time()
                loss = model.train_on_batch(noisy_batch, target_batch)
                print("%d-%d %ds [Loss: %.3f][PSNR: %.3f, Jaccard: %.3f]" %
                      (epoch_i,
                       step_i,
                       time.time() - step_start_time,
                       loss[0],
                       loss[1],
                       loss[2]
                       ))
                step_i += 1

    Args:
        input_dir (:obj:`str`) : directory containing all input data to be generated
        target_dir (:obj:`str`) : directory containing all target data to be generated
        matrix_size (:obj:`int`) : size of each patch that the full ratio matrix is divided into
        batch_size (:obj:`int`) : number of patches to use in each batch
        anchor_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
        step_size (:obj:`int`) : size of steps used when generating batches. Values less than ``matrix size`` will include overlapping regions
        multi_input (:obj:`bool`) : set to True to generate batches from each subfolder of ``input_dir``
        shuffle (:obj:`bool`) : set to True to randomly split the matrix instead of sliding across sequentially
        random_steps (:obj:`bool`) : set to True add a random offset to each step between patch indices
        normalize (:obj:`bool`) : set to True to normalize all ratio values between ``[0, 1]``
        diagonal_only (:obj:`bool`) : set to True to only generate patches along the diagonal of the matrix
        upper_triangular_only (:obj:`bool`) : set to True to skip patches below the diagonal
        force_symmetry (:obj:`bool`) : forwarded to :func:`load_chr_ratio_matrix_from_sparse`
        ignore_XY (:obj:`bool`) : set to True to ignore chromosomes X and Y when generating batches
        ignore_even_chr (:obj:`bool`) : set to True to ignore all even numbered chromosomes
        ignore_odd_chr (:obj:`bool`) : set to True to ignore all odd numbered chromosomes

    Yields:
        (``numpy.array``, ``numpy.array``, ``str``): input batch, target batch, and batch label
    """
    input_batch = []
    target_batch = []
    if multi_input:
        input_folders = os.listdir(input_dir)  # get list of all folders in input dir
        input_files = sorted(os.listdir(input_dir + input_folders[0]))  # get list of input files (assume all inputs have same name pattern)
        target_files = sorted(os.listdir(target_dir))
        '''
        # remove duplicates of chromosomes
        tmp = []
        for f in input_files:
            if '.p_val' in f and f.replace('.p_val', '') in input_files:
                tmp.append(f.replace('.p_val', ''))
        if len(tmp) > 0:
            input_files = tmp
        print(input_files)
        '''
    else:
        input_files = sorted(os.listdir(input_dir))
        target_files = sorted(os.listdir(target_dir))

    if shuffle:  # shuffle chromosome file order, keeping input/target pairs aligned
        c = list(zip(input_files, target_files))
        random.shuffle(c)
        input_files, target_files = zip(*c)

    if ignore_XY:
        # sex chromosomes are dropped by substring match on the file name
        remove_XY = lambda files: [f for f in files if 'chrX' not in f and 'chrY' not in f]
        input_files = remove_XY(input_files)
        target_files = remove_XY(target_files)

    if ignore_odd_chr:
        # keep only even-numbered chromosomes; assumes file names look like
        # '...chr<N>.matrix...' so the text between 'chr' and '.matrix' is the number
        remove_odds = lambda files: [f for f in files if f[f.index('chr') + 3:f.index('.matrix')].isdigit() and int(f[f.index('chr') + 3:f.index('.matrix')]) % 2 == 0]
        input_files = remove_odds(input_files)
        target_files = remove_odds(target_files)
    elif ignore_even_chr:
        # keep only odd-numbered chromosomes (same file name assumption)
        remove_evens = lambda files: [f for f in files if f[f.index('chr') + 3:f.index('.matrix')].isdigit() and int(f[f.index('chr') + 3:f.index('.matrix')]) % 2 != 0]
        input_files = remove_evens(input_files)
        target_files = remove_evens(target_files)

    for input_file, target_file in zip(input_files, target_files):
        n_matrices = 0
        start_index = 0
        if multi_input:
            # one shared target matrix, one pass over every input replicate folder
            target_matrix = load_chr_ratio_matrix_from_sparse(target_dir, target_file, anchor_dir, force_symmetry=force_symmetry)
            for input_folder in input_folders:
                input_folder += '/'
                input_matrix = load_chr_ratio_matrix_from_sparse(input_dir + input_folder, input_file, anchor_dir, force_symmetry=force_symmetry)
                for input_batch, target_batch, figure_title in split_matrix(input_filename=input_folder + input_file,
                                                                            input_matrix=input_matrix,
                                                                            target_matrix=target_matrix,
                                                                            input_batch=input_batch,
                                                                            target_batch=target_batch,
                                                                            matrix_size=matrix_size,
                                                                            step_size=step_size,
                                                                            batch_size=batch_size,
                                                                            n_matrices=n_matrices,
                                                                            start_index=start_index,
                                                                            normalize=normalize,
                                                                            shuffle=shuffle,
                                                                            random_steps=random_steps,
                                                                            diagonal_only=diagonal_only,
                                                                            upper_triangular_only=upper_triangular_only):
                    yield input_batch, target_batch, figure_title
                    # reset accumulators before re-entering split_matrix
                    input_batch = []
                    target_batch = []
                    n_matrices = 0
        else:
            input_matrix = load_chr_ratio_matrix_from_sparse(input_dir, input_file, anchor_dir, force_symmetry=force_symmetry)
            target_matrix = load_chr_ratio_matrix_from_sparse(target_dir, target_file, anchor_dir, force_symmetry=force_symmetry)
            for input_batch, target_batch, figure_title in split_matrix(input_filename=input_file,
                                                                        input_matrix=input_matrix,
                                                                        target_matrix=target_matrix,
                                                                        input_batch=input_batch,
                                                                        target_batch=target_batch,
                                                                        matrix_size=matrix_size,
                                                                        step_size=step_size,
                                                                        batch_size=batch_size,
                                                                        n_matrices=n_matrices,
                                                                        start_index=start_index,
                                                                        normalize=normalize,
                                                                        shuffle=shuffle,
                                                                        random_steps=random_steps,
                                                                        diagonal_only=diagonal_only,
                                                                        upper_triangular_only=upper_triangular_only):
                yield input_batch, target_batch, figure_title
                # reset accumulators before re-entering split_matrix
                input_batch = []
                target_batch = []
                n_matrices = 0
499
+
500
+
501
def get_matrices_from_loci(input_dir,
                           target_dir,
                           matrix_size,
                           loci,
                           anchor_dir=None):
    """
    Yield input/target sample patches anchored at specific genomic loci.

    Args:
        input_dir (:obj:`str`) : directory containing all input data to be generated
        target_dir (:obj:`str`) : directory containing all target data to be generated
        matrix_size (:obj:`int`) : size of each patch that the full ratio matrix is divided into
        loci (:obj:`dict`) : dictionary of chromosome locus pairs
        anchor_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files

    Yields:
        (``numpy.array``, ``numpy.array``, ``str``, ``int``, ``int``): input patch,
        target patch (each shaped ``(1, matrix_size, matrix_size, 1)``),
        chromosome name, requested locus, and the anchor index of the patch origin
    """
    paired_files = zip(sorted_nicely(os.listdir(input_dir)),
                       sorted_nicely(os.listdir(target_dir)))
    for in_file, tgt_file in paired_files:
        chr_name = get_chromosome_from_filename(in_file)
        if chr_name not in loci.keys():
            continue  # no requested locus on this chromosome
        anchor_list = pd.read_csv(anchor_dir + '%s.bed' % chr_name, sep='\t',
                                  names=['chr', 'start', 'end', 'anchor'])  # read anchor list file
        input_matrix = load_chr_ratio_matrix_from_sparse(input_dir, in_file, anchor_dir)
        target_matrix = load_chr_ratio_matrix_from_sparse(target_dir, tgt_file, anchor_dir)

        locus = loci[chr_name]
        # boolean mask over anchors whose interval contains the requested locus
        containing = (anchor_list['start'] <= locus) & (locus <= anchor_list['end']) & (anchor_list['chr'] == chr_name)

        for i, is_hit in enumerate(containing):
            if not is_hit:
                continue
            input_tile = input_matrix[i:i + matrix_size, i:i + matrix_size].A
            target_tile = target_matrix[i:i + matrix_size, i:i + matrix_size].A
            # add batch and channel dimensions: (m, m) -> (1, m, m, 1)
            input_tile = input_tile[np.newaxis, :, :, np.newaxis]
            target_tile = target_tile[np.newaxis, :, :, np.newaxis]

            yield input_tile, target_tile, chr_name, locus, i
544
+
545
+
546
def get_top_loops(matrix_data_dir, reference_dir, num_top_loops=None, q=None, dummy=5):
    """
    Ranks the ratio values of all chromosomes and computes the cutoff value for taking the top ``num_top_loops`` or the ``q`` th quantile.

    Results are memoized in ``data_dir/top_loop_values.pickle`` keyed by
    ``matrix_data_dir`` plus the requested quantile/count, so repeated calls
    with the same arguments skip the full scan.

    Args:
        matrix_data_dir (:obj:`str`) : directory containing the anchor to anchor files used to count loops
        reference_dir (:obj:`str`) : directory containing the reference anchor ``.bed`` files
        num_top_loops (:obj:`int`) : number of top loops to consider (used when ``q`` is None)
        q (:obj:`float`) : quantile of loops to consider (takes precedence over ``num_top_loops``)
        dummy (:obj:`int`) : dummy value to use to calculate each ratio value

    Returns:
        ``float`` : cutoff value for top loops
    """
    global data_dir
    # load the memoized cutoffs if a cache file exists, else start fresh
    if 'top_loop_values.pickle' in os.listdir(data_dir):
        with open(data_dir + 'top_loop_values.pickle', 'rb') as handle:
            top_loop_values = pickle.load(handle)
    else:
        top_loop_values = {}
    if q is not None:  # select top loops based on quantile not quantity
        if matrix_data_dir + str(q) in top_loop_values.keys():
            genome_min_loop_value = top_loop_values[matrix_data_dir + str(q)]
        else:
            top_loops = np.array([])
            for file in os.listdir(matrix_data_dir):
                sparse = load_chr_ratio_matrix_from_sparse(matrix_data_dir, file, reference_dir, dummy=dummy)
                sparse = scipy.sparse.triu(sparse)  # only count each loop once
                nonzero_indices = sparse.nonzero()
                # accumulate every nonzero upper-triangular value genome-wide
                top_loops = np.append(top_loops, sparse.tocsr()[nonzero_indices].A)
            genome_min_loop_value = np.quantile(top_loops, q=q)
            top_loop_values[matrix_data_dir + str(q)] = genome_min_loop_value
            print('%s %.4f quantile loops cutoff value: %f' % (matrix_data_dir, q, genome_min_loop_value))
    else:  # select top loops based on rank
        if matrix_data_dir + str(num_top_loops) in top_loop_values.keys():
            genome_min_loop_value = top_loop_values[matrix_data_dir + str(num_top_loops)]
        else:
            top_loops = np.array([])
            for file in os.listdir(matrix_data_dir):
                sparse = load_chr_ratio_matrix_from_sparse(matrix_data_dir, file, reference_dir, dummy=dummy)
                sparse = scipy.sparse.triu(sparse)
                # keep a running top-k: merge this chromosome's stored values
                # with the current leaders, then truncate to num_top_loops
                loop_list = np.append(top_loops, sparse.data)
                top_loops = loop_list[np.argsort(-loop_list)[:num_top_loops]]
            # smallest value among the genome-wide top-k is the cutoff
            # NOTE(review): raises IndexError if no loops were found at all
            genome_min_loop_value = top_loops[-1]
            top_loop_values[matrix_data_dir + str(num_top_loops)] = genome_min_loop_value
            print('%s top %d loops cutoff value: %f' % (matrix_data_dir, num_top_loops, genome_min_loop_value))
    # always rewrite the cache so newly computed cutoffs persist
    with open(data_dir + 'top_loop_values.pickle', 'wb') as handle:
        pickle.dump(top_loop_values, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return genome_min_loop_value
596
+
597
+
598
def anchor_list_to_dict(anchors):
    """
    Build a lookup table from anchor name to its positional (chromosomal) index.

    Args:
        anchors (:obj:`numpy.array`) : array of anchor name values

    Returns:
        `dict` : dictionary mapping each anchor name to its index in the array
    """
    return {name: index for index, name in enumerate(anchors)}
612
+
613
+
614
def anchor_to_locus(anchor_dict):
    """
    Build a single-argument lookup function over ``anchor_dict`` that is easy
    to broadcast with ``numpy.vectorize``.

    Args:
        anchor_dict (:obj:`dict`) : dictionary mapping each anchor to its chromosomal index

    Returns:
        `function` : callable returning the index of a given anchor name
    """
    def lookup(anchor_name):
        # plain closure so np.vectorize can map it over arrays of names
        return anchor_dict[anchor_name]
    return lookup
627
+
628
+
629
def sorted_nicely(l):
    """
    Sorts an iterable object according to file system defaults
    (natural / human order, so 'chr2' sorts before 'chr10').

    Args:
        l (:obj:`iterable`) : iterable object containing items which can be interpreted as text

    Returns:
        `iterable` : sorted list of the items
    """
    def natural_key(item):
        # split into digit / non-digit runs; digit runs compare numerically
        return [int(piece) if piece.isdigit() else piece
                for piece in re.split('([0-9]+)', item)]
    return sorted(l, key=natural_key)
641
+
642
+
643
def normalize_matrix(matrix):
    """
    Normalize ratio values between ``[0, 1]`` using the following function:

    .. math::
        f(x) = 1 - \\frac{1}{1 + x}

    .. image:: _static/normalization_function_plot.PNG
        :scale: 100 %
        :align: center

    Args:
        matrix (:obj:`numpy.array`) : matrix of ratio values

    Returns:
        ``numpy.array`` : matrix of normalized ratio values between ``[0, 1]``
    """
    shifted = 1 + matrix  # elementwise shift keeps the expression readable
    return 1 - (1 / shifted)
661
+
662
+
663
def denormalize_matrix(matrix):
    """
    Reverse the normalization of a matrix to set all valid normalized values back to their original ratio values using the following function:

    .. math::

        f^{-1}(x) = \\frac{1}{1 - g(x)} - 1 &\\quad \\mbox{where} &\\quad g(x) = \\begin{cases} 0.98, & \\mbox{if } x > 1 \\\\ 0, & \\mbox{if } x < 0 \\\\ x & \\mbox{ otherwise} \\end{cases}

    We apply the function :math:`g(x)` to remove invalid values that could be in a predicted result and because :math:`f^{-1}(x)` blows up as we approach 1:

    .. image:: _static/denormalization_function_plot.PNG
        :scale: 100 %
        :align: center

    Args:
        matrix (:obj:`numpy.array`) : matrix of normalized ratio values

    Returns:
        ``numpy.array`` : matrix of ratio values (a new array; the input is not modified)
    """
    # Work on a copy: the previous implementation clamped the caller's array
    # in place, silently corrupting the input for any later use.
    clamped = np.array(matrix, copy=True)
    clamped[clamped > 1] = 0.98  # cap near 1 so 1/(1-x) stays finite
    clamped[clamped < 0] = 0
    return (1 / (1 - clamped)) - 1
686
+
687
+
688
def draw_heatmap(matrix, color_scale, ax=None, return_image=False):
    """
    Display ratio heatmap containing only strong signals (values > 1 or 0.98th quantile)

    Args:
        matrix (:obj:`numpy.array`) : ratio matrix to be displayed
        color_scale (:obj:`int`) : max ratio value to be considered strongest by color mapping
        ax (:obj:`matplotlib.axes.Axes`) : axes which will contain the heatmap. If None, new axes are created
        return_image (:obj:`bool`) : set to True to return the image obtained from drawing the heatmap with the generated color map

    Returns:
        ``numpy.array`` : if ``return_image`` is set to True, return the heatmap as an array
    """
    # NOTE(review): relies on `matplotlib` and `plt` being imported at module
    # level (not visible in this chunk) — confirm against the file header.
    # Build 20 color-bin boundaries starting just above 1.0 so values <= 1
    # fall into the white (lowest) bin.
    if color_scale != 0:
        # Fixed upper bound supplied by the caller; last boundary is the matrix max.
        breaks = np.append(np.arange(1.001, color_scale, (color_scale - 1.001) / 18), np.max(matrix))
    elif np.max(matrix) < 2:
        # Low-dynamic-range matrix: spread the bins up to the observed max.
        breaks = np.arange(1.001, np.max(matrix), (np.max(matrix) - 1.001) / 19)
    else:
        # Auto-scale: saturate the color map at the 95th percentile, but never
        # below 2, so outliers do not wash out the rest of the map.
        step = (np.quantile(matrix, q=0.95) - 1) / 18
        up = np.quantile(matrix, q=0.95) + 0.011
        if up < 2:
            up = 2
            step = 0.999 / 18
        breaks = np.append(np.arange(1.001, up, step), np.max(matrix))

    n_bin = 20  # Discretizes the interpolation into bins
    # White -> red ramp (19 anchor colors interpolated into n_bin bins).
    colors = ["#FFFFFF", "#FFE4E4", "#FFD7D7", "#FFC9C9", "#FFBCBC", "#FFAEAE", "#FFA1A1", "#FF9494", "#FF8686",
              "#FF7979", "#FF6B6B", "#FF5E5E", "#FF5151", "#FF4343", "#FF3636", "#FF2828", "#FF1B1B", "#FF0D0D",
              "#FF0000"]
    cmap_name = 'my_list'
    # Create the colormap
    cm = matplotlib.colors.LinearSegmentedColormap.from_list(
        cmap_name, colors, N=n_bin)
    norm = matplotlib.colors.BoundaryNorm(breaks, 20)
    # Fewer bins will result in "coarser" colomap interpolation
    if ax is None:
        _, ax = plt.subplots()
    img = ax.imshow(matrix, cmap=cm, norm=norm, interpolation='nearest')
    if return_image:
        # Close the figure so batch rendering does not leak open figures,
        # then hand back the drawn data array.
        plt.close()
        return img.get_array()
    # Implicitly returns None when return_image is False; the heatmap is left
    # drawn on `ax` for the caller.
729
+
730
+
731
def get_heatmap(matrix, color_scale):
    """
    Render a ratio matrix to an RGBA heatmap array using the same white->red
    binned color map as ``draw_heatmap``, without drawing to any axes.

    Args:
        matrix (:obj:`numpy.array`) : ratio matrix to be rendered
        color_scale (:obj:`int`) : max ratio value to be considered strongest by color mapping; 0 auto-scales from the data

    Returns:
        ``numpy.array`` : RGBA image array; the alpha channel is 1 where the
        ratio exceeds 1.2 and 0 elsewhere, so weak signal is transparent
    """
    # NOTE(review): relies on `matplotlib` being imported at module level
    # (not visible in this chunk) — confirm against the file header.
    # Build 20 color-bin boundaries starting just above 1.0 (values <= 1 map
    # to the white bin). Mirrors draw_heatmap but auto-scales at the 0.98
    # quantile instead of 0.95.
    if color_scale != 0:
        breaks = np.append(np.arange(1.001, color_scale, (color_scale - 1.001) / 18), np.max(matrix))
    elif np.max(matrix) < 2:
        breaks = np.arange(1.001, np.max(matrix), (np.max(matrix) - 1.001) / 19)
    else:
        # Saturate the color map at the 98th percentile, but never below 2.
        step = (np.quantile(matrix, q=0.98) - 1) / 18
        up = np.quantile(matrix, q=0.98) + 0.011
        if up < 2:
            up = 2
            step = 0.999 / 18
        breaks = np.append(np.arange(1.001, up, step), np.max(matrix))

    n_bin = 20  # Discretizes the interpolation into bins
    # White -> red ramp (19 anchor colors interpolated into n_bin bins).
    colors = ["#FFFFFF", "#FFE4E4", "#FFD7D7", "#FFC9C9", "#FFBCBC", "#FFAEAE", "#FFA1A1", "#FF9494", "#FF8686",
              "#FF7979", "#FF6B6B", "#FF5E5E", "#FF5151", "#FF4343", "#FF3636", "#FF2828", "#FF1B1B", "#FF0D0D",
              "#FF0000"]
    cmap_name = 'my_list'
    # Create the colormap
    cm = matplotlib.colors.LinearSegmentedColormap.from_list(
        cmap_name, colors, N=n_bin)
    norm = matplotlib.colors.BoundaryNorm(breaks, 20)
    # Fewer bins will result in "coarser" colomap interpolation
    m = matplotlib.cm.ScalarMappable(norm=norm, cmap=cm)
    heatmap = m.to_rgba(matrix)
    # Make everything at or below ratio 1.2 fully transparent by overwriting
    # the alpha channel with a boolean mask.
    mask = matrix > 1.2
    heatmap[..., -1] = np.ones_like(mask) * mask
    return heatmap
759
+
760
+
761
def save_images_to_video(output_name, out_dir, image_folder='images', fps=29.94, hold_frames=150):
    """
    Saves all training visualization images to a video file

    Args:
        output_name (:obj:`str`) : filename (without extension) for the saved video file
        out_dir (:obj:`str`) : directory prefix where the ``.avi`` file is written
        image_folder (:obj:`str`) : directory containing the source ``.png`` frames
        fps (:obj:`float`) : frame rate of the output video
        hold_frames (:obj:`int`) : number of extra copies of the final frame appended
            so the video lingers on the last image

    Raises:
        ValueError : if ``image_folder`` contains no ``.png`` images
    """
    video_name = out_dir + output_name + '.avi'

    # NOTE(review): lexicographic sort — assumes frame filenames are
    # zero-padded; otherwise 'img10.png' sorts before 'img2.png'
    # (sorted_nicely in this module would give natural order).
    images = [img for img in sorted(os.listdir(image_folder)) if img.endswith(".png")]
    if not images:
        # Previously this crashed with an opaque IndexError on images[0].
        raise ValueError('no .png images found in %r' % image_folder)

    # First frame determines the video dimensions.
    frame = cv2.imread(os.path.join(image_folder, images[0]))
    height, width, layers = frame.shape

    video = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), fps, (width, height))
    try:
        for image in images:
            video.write(cv2.imread(os.path.join(image_folder, image)))

        # Repeat the final frame so the video does not end abruptly.
        last_frame = cv2.imread(os.path.join(image_folder, images[-1]))
        for _ in range(hold_frames):
            video.write(last_frame)
    finally:
        # Always release the writer, even if a read/write fails mid-way,
        # so the partially written file is flushed and the handle freed.
        cv2.destroyAllWindows()
        video.release()
786
+
787
+
788
def get_model_memory_usage(batch_size, model):
    """
    Estimates the amount of memory required to train the model using the current batch size.

    Args:
        batch_size (:obj:`int`) : number of training samples in each batch
        model (:obj:`keras.models.Model`) : uncompiled Keras model to be trained

    Returns:
        ``float`` : estimated memory usage in GB
    """
    # Total number of activation elements across all layer outputs
    # (dimensions that are None, e.g. the batch axis, are skipped).
    activation_count = 0
    for layer in model.layers:
        layer_elems = 1
        for dim in layer.output_shape:
            if dim is not None:
                layer_elems *= dim
        activation_count += layer_elems

    # Deduplicate shared weight tensors before counting parameters.
    trainable_params = np.sum([K.count_params(w) for w in set(model.trainable_weights)])
    frozen_params = np.sum([K.count_params(w) for w in set(model.non_trainable_weights)])

    # Bytes per value for the backend float type (default float32 -> 4 bytes).
    bytes_per_value = {'float16': 2.0, 'float64': 8.0}.get(K.floatx(), 4.0)

    total_bytes = bytes_per_value * (batch_size * activation_count + trainable_params + frozen_params)
    return np.round(total_bytes / (1024.0 ** 3), 3)