Andrew Stirn committed on
Commit
1ef81e0
·
1 Parent(s): eac7d3f

off target scanning

Browse files
Files changed (1) hide show
  1. tiger.py +60 -4
tiger.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
- import tensorflow as tf
3
  import pandas as pd
 
4
  from Bio import SeqIO
5
 
6
  GUIDE_LEN = 23
@@ -78,13 +79,68 @@ def tiger_predict(transcript_seq: str):
78
  # get predictions
79
  normalized_lfc = tiger.predict_step(model_inputs)
80
  predictions = pd.DataFrame({'Guide': guide_seq, 'Normalized LFC': tf.squeeze(normalized_lfc).numpy()})
 
81
 
82
  return predictions
83
 
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  if __name__ == '__main__':
86
 
87
  # simple test case
88
- transcript_sequence = 'ACGTACGTACGTACGTACGTACGTACGTACGT'.lower()
89
- df = tiger_predict(transcript_sequence)
90
- print(df)
 
 
 
 
 
 
 
 
1
  import os
2
+ import numpy as np
3
  import pandas as pd
4
+ import tensorflow as tf
5
  from Bio import SeqIO
6
 
7
  GUIDE_LEN = 23
 
79
  # get predictions
80
  normalized_lfc = tiger.predict_step(model_inputs)
81
  predictions = pd.DataFrame({'Guide': guide_seq, 'Normalized LFC': tf.squeeze(normalized_lfc).numpy()})
82
+ predictions = predictions.set_index('Guide').sort_values('Normalized LFC')
83
 
84
  return predictions
85
 
86
 
87
def find_off_targets(guides, batch_size=1000):
    """Scan the transcripts in 'gencode.v19.pc_transcripts.fa' for near-matches to each guide.

    The guides are complemented, one-hot encoded and used as 1-D convolution
    filters over the one-hot encoded transcripts. GUIDE_LEN minus the
    convolution output is treated as the number of mismatches at each
    alignment; alignments with at most NUM_MISMATCHES mismatches are reported.

    Args:
        guides: list of guide sequences (strings).
            NOTE(review): presumably each guide is GUIDE_LEN long -- confirm against caller.
        batch_size: number of transcripts encoded and scanned per iteration.

    Returns:
        A DataFrame with one row per reported alignment and the columns
        'Guide', 'Isoform' (fifth '|'-separated field of the FASTA record id),
        'Mismatches', 'Midpoint' (position of the alignment centre within the
        transcript) and 'Target' (the transcript window of at most TARGET_LEN
        characters that starts CONTEXT_5P characters before the aligned guide;
        the window is clipped where it would run past either transcript end).
        The frame is empty when nothing is found.

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        # the batching loop could never advance with a non-positive step
        raise ValueError('batch_size must be a positive integer')

    # load every transcript and index it by the fifth field of its FASTA id
    with open('gencode.v19.pc_transcripts.fa', 'r') as file:
        df_transcripts = pd.DataFrame(
            [(t.id, str(t.seq)) for t in SeqIO.parse(file, 'fasta')], columns=['id', 'seq'])
    df_transcripts['id'] = df_transcripts['id'].apply(lambda s: s.split('|')[4])
    df_transcripts.set_index('id', inplace=True)
    num_transcripts = len(df_transcripts)

    # one-hot encode guides to form a filter
    # NOTE(review): assumes the encoder returns (sequence, position, channel) so that the
    # transpose yields conv1d's (width, in_channels, out_channels) layout -- confirm
    guide_filter = one_hot_encode_sequence(sequence_complement(guides), add_context_padding=False)
    guide_filter = tf.transpose(guide_filter, [1, 2, 0])
    guide_array = np.array(guides)

    # loop over transcripts in batches, collecting one frame of hits per batch
    print('Scanning for off-targets')
    batch_frames = []
    for batch_start in range(0, num_transcripts, batch_size):
        # select batch (iloc tolerates a stop index past the last row)
        df_batch = df_transcripts.iloc[batch_start:batch_start + batch_size]

        # find and log off-targets
        transcripts = one_hot_encode_sequence(df_batch['seq'].values.tolist(), add_context_padding=False)
        num_mismatches = GUIDE_LEN - tf.nn.conv1d(transcripts, guide_filter, stride=1, padding='SAME')
        # columns of loc_off_targets: (transcript in batch, position, guide)
        loc_off_targets = tf.where(num_mismatches <= NUM_MISMATCHES).numpy()
        batch_frames.append(pd.DataFrame({
            'Guide': guide_array[loc_off_targets[:, 2]],
            'Isoform': df_batch.index.values[loc_off_targets[:, 0]],
            'Mismatches': tf.gather_nd(num_mismatches, loc_off_targets).numpy().astype(int),
            'Midpoint': loc_off_targets[:, 1],
            'Target': df_batch['seq'].values[loc_off_targets[:, 0]],
        }))

        # progress update
        scanned = batch_start + batch_size
        print('\rPercent complete: {:.2f}%'.format(100 * min(scanned / num_transcripts, 1)), end='')
    print('')

    # concatenate once instead of re-copying the accumulated frame on every batch
    records = pd.concat(batch_frames).to_dict('records') if batch_frames else []

    # trim transcripts to targets
    for row in records:
        window_start = row['Midpoint'] - (GUIDE_LEN // 2) - CONTEXT_5P
        window_end = window_start + TARGET_LEN
        # A negative start would make Python slice from the END of the transcript, so
        # clip the window at position 0 and remember how many characters were dropped.
        clipped = max(-window_start, 0)
        row['Target'] = row['Target'][max(window_start, 0):max(window_end, 0)]
        if row['Mismatches'] == 0:
            # sanity check: an exact hit must be the complement of its guide; the guide
            # region shifts left by however many context characters were clipped
            guide_region = row['Target'][CONTEXT_5P - clipped:TARGET_LEN - CONTEXT_3P - clipped]
            assert row['Guide'] == sequence_complement([guide_region])[0]

    return pd.DataFrame(records)
132
+
133
+
if __name__ == '__main__':

    # demo input: first 50 from EIF3B-003's CDS
    demo_transcript = 'ATGCAGGACGCGGAGAACGTGGCGGTGCCCGAGGCGGCCGAGGAGCGCGC'.lower()

    # score the guides and keep only the first NUM_TOP_GUIDES rows of the result
    top_guides = tiger_predict(demo_transcript).iloc[:NUM_TOP_GUIDES]
    print(top_guides)

    # scan for off-targets of the retained guides (the frame is indexed by guide)
    guide_list = top_guides.index.values.tolist()
    print(find_off_targets(guide_list))