Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Andrew Stirn
committed on
Commit
·
1ef81e0
1
Parent(s):
eac7d3f
off target scanning
Browse files
tiger.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import os
|
2 |
-
import
|
3 |
import pandas as pd
|
|
|
4 |
from Bio import SeqIO
|
5 |
|
6 |
GUIDE_LEN = 23
|
@@ -78,13 +79,68 @@ def tiger_predict(transcript_seq: str):
|
|
78 |
# get predictions
|
79 |
normalized_lfc = tiger.predict_step(model_inputs)
|
80 |
predictions = pd.DataFrame({'Guide': guide_seq, 'Normalized LFC': tf.squeeze(normalized_lfc).numpy()})
|
|
|
81 |
|
82 |
return predictions
|
83 |
|
84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
if __name__ == '__main__':
|
86 |
|
87 |
# simple test case
|
88 |
-
transcript_sequence = '
|
89 |
-
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
import numpy as np
|
3 |
import pandas as pd
|
4 |
+
import tensorflow as tf
|
5 |
from Bio import SeqIO
|
6 |
|
7 |
GUIDE_LEN = 23
|
|
|
79 |
# get predictions
|
80 |
normalized_lfc = tiger.predict_step(model_inputs)
|
81 |
predictions = pd.DataFrame({'Guide': guide_seq, 'Normalized LFC': tf.squeeze(normalized_lfc).numpy()})
|
82 |
+
predictions = predictions.set_index('Guide').sort_values('Normalized LFC')
|
83 |
|
84 |
return predictions
|
85 |
|
86 |
|
87 |
+
def find_off_targets(guides, batch_size=1000):
    """Scan the transcripts in ``gencode.v19.pc_transcripts.fa`` for guide near-matches.

    The complement of every guide is one-hot encoded and used as a 1-D
    convolution filter over the one-hot encoded transcripts. The number of
    mismatches at each alignment is taken as ``GUIDE_LEN`` minus the
    convolution output (this assumes ``one_hot_encode_sequence`` yields 0/1
    indicators -- confirm against its definition). Alignments with at most
    ``NUM_MISMATCHES`` mismatches are reported.

    Args:
        guides: sequence of guide strings to search for.
        batch_size: number of transcripts encoded and convolved per step.

    Returns:
        A ``pandas.DataFrame`` with one row per hit and the columns
        ``Guide``, ``Isoform`` (fifth ``|``-delimited field of the FASTA
        record id), ``Mismatches``, ``Midpoint`` (position along the
        transcript of the convolution output) and ``Target`` (the transcript
        slice of up to ``TARGET_LEN`` characters around the hit). The frame
        is empty, but keeps these columns, when nothing is found.

    Raises:
        AssertionError: if a zero-mismatch hit does not reproduce its guide.
    """
    columns = ['Guide', 'Isoform', 'Mismatches', 'Midpoint', 'Target']

    # nothing to search for: skip loading and scanning the transcriptome
    if len(guides) == 0:
        return pd.DataFrame(columns=columns)

    with open('gencode.v19.pc_transcripts.fa', 'r') as file:
        records = [(t.id, str(t.seq)) for t in SeqIO.parse(file, 'fasta')]
    df_transcripts = pd.DataFrame(records, columns=['id', 'seq'])
    df_transcripts['id'] = df_transcripts['id'].apply(lambda s: s.split('|')[4])
    df_transcripts.set_index('id', inplace=True)

    # one-hot encode guides to form a filter
    guide_filter = one_hot_encode_sequence(sequence_complement(guides), add_context_padding=False)
    guide_filter = tf.transpose(guide_filter, [1, 2, 0])
    guide_array = np.array(guides)  # loop-invariant, so built once

    # loop over transcripts in batches
    print('Scanning for off-targets')
    num_transcripts = len(df_transcripts)
    batch_hits = []
    for batch_start in range(0, num_transcripts, batch_size):
        batch_stop = min(batch_start + batch_size, num_transcripts)
        df_batch = df_transcripts.iloc[batch_start:batch_stop]

        # find and log off-targets
        transcripts = one_hot_encode_sequence(df_batch['seq'].values.tolist(), add_context_padding=False)
        num_mismatches = GUIDE_LEN - tf.nn.conv1d(transcripts, guide_filter, stride=1, padding='SAME')
        # rows of loc_off_targets index (transcript in batch, position, guide)
        loc_off_targets = tf.where(num_mismatches <= NUM_MISMATCHES).numpy()
        if len(loc_off_targets) > 0:
            # collect per-batch frames and concatenate once after the loop;
            # concatenating inside the loop re-copies all earlier rows each time
            batch_hits.append(pd.DataFrame({
                'Guide': guide_array[loc_off_targets[:, 2]],
                'Isoform': df_batch.index.values[loc_off_targets[:, 0]],
                'Mismatches': tf.gather_nd(num_mismatches, loc_off_targets).numpy().astype(int),
                'Midpoint': loc_off_targets[:, 1],
                'Target': df_batch['seq'].values[loc_off_targets[:, 0]],
            }))

        # progress update
        print('\rPercent complete: {:.2f}%'.format(100 * (batch_stop / num_transcripts)), end='')
    print('')

    if not batch_hits:
        return pd.DataFrame(columns=columns)

    # trim transcripts to targets
    dict_off_targets = pd.concat(batch_hits, ignore_index=True).to_dict('records')
    for row in dict_off_targets:
        start_location = row['Midpoint'] - (GUIDE_LEN // 2) - CONTEXT_5P
        # A hit close to the 5' end gives a negative start, which Python
        # slicing would interpret as an offset from the END of the transcript.
        # Clip the window at position 0 instead; the target then carries
        # `clipped` fewer leading characters than usual.
        clipped = max(-start_location, 0)
        row['Target'] = row['Target'][start_location + clipped:start_location + TARGET_LEN]
        if row['Mismatches'] == 0:
            # sanity check: a perfect hit must reproduce the guide; the guide
            # region shifts left by however much leading context was clipped
            guide_region = row['Target'][CONTEXT_5P - clipped:TARGET_LEN - CONTEXT_3P - clipped]
            assert row['Guide'] == sequence_complement([guide_region])[0]

    return pd.DataFrame(dict_off_targets, columns=columns)
|
132 |
+
|
133 |
+
|
134 |
if __name__ == '__main__':

    # smoke test input: first 50 from EIF3B-003's CDS, lower-cased
    transcript_sequence = 'ATGCAGGACGCGGAGAACGTGGCGGTGCCCGAGGCGGCCGAGGAGCGCGC'.lower()
    all_predictions = tiger_predict(transcript_sequence)

    # keep and show only the leading NUM_TOP_GUIDES rows of the predictions
    top_predictions = all_predictions.iloc[:NUM_TOP_GUIDES]
    print(top_predictions)

    # the prediction frame is indexed by guide; scan those guides for off-targets
    top_guides = top_predictions.index.values.tolist()
    print(find_off_targets(top_guides))
|