Andrew Stirn committed on
Commit
457a981
·
1 Parent(s): a1b3810

off-target model with guide sequence utilization

Browse files
model/fingerprint.pb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01a50341063d589fc4efcbb7dc7354318f9dcdba65575e608759284dcc0d8162
3
- size 53
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52d2b657d4a87fe128786cd8435a2f4c8d4e5d08571b12ff8911f100c0ee043b
3
+ size 54
model/keras_metadata.pb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c7badc5998ecd142564cb70002b001ee812d404f4ac30976bb33c1233ab898a
3
- size 13592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e89af418a7cbb78442c6a65f20f2817352361b826189c4aad0de8a531aa5a8d
3
+ size 13629
model/saved_model.pb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c94e1de45290f8663320886419bd4cf611aa7fa00fce146bc0d96d35b8b5e39
3
- size 214038
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f91692a0db6169ce09c321d292a7622468ea1b46c2f2293d97e43aa7c9cb9719
3
+ size 241848
model/variables/variables.data-00000-of-00001 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96f573e7920d24eacd8c00c32f2995392f038629a4ce5ee27d6454448025276e
3
- size 522375
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff1487ef1c93444ea6eeb6b023a0f4095aa0af473d98022d8c6e8b9e339d0add
3
+ size 948103
model/variables/variables.index CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:612540024fe115acd056ffc34e9d73b223a14c8620f06c6caee871a3a61f8941
3
- size 869
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f00477f5d3801ba7e566cfcb16d16970ee88a94b716db4232e30ac27115bbfbd
3
+ size 877
tiger.py CHANGED
@@ -7,6 +7,7 @@ CONTEXT_5P = 3
7
  CONTEXT_3P = 0
8
  TARGET_LEN = CONTEXT_5P + GUIDE_LEN + CONTEXT_3P
9
  NUCLEOTIDE_TOKENS = dict(zip(['A', 'C', 'G', 'T'], [0, 1, 2, 3]))
 
10
 
11
 
12
  def process_data(transcript_seq: str):
@@ -17,16 +18,29 @@ def process_data(transcript_seq: str):
17
  # get all target sites
18
  target_seq = [transcript_seq[i: i + TARGET_LEN] for i in range(len(transcript_seq) - TARGET_LEN)]
19
 
20
- # get one-hot encodings
 
 
 
 
21
  nucleotide_table = tf.lookup.StaticVocabularyTable(
22
  initializer=tf.lookup.KeyValueTensorInitializer(
23
  keys=tf.constant(list(NUCLEOTIDE_TOKENS.keys()), dtype=tf.string),
24
  values=tf.constant(list(NUCLEOTIDE_TOKENS.values()), dtype=tf.int64)),
25
  num_oov_buckets=1)
26
  target_tokens = nucleotide_table.lookup(tf.stack([list(t) for t in target_seq], axis=0))
27
- target_one_hot = tf.reshape(tf.one_hot(target_tokens, depth=4), [len(target_seq), -1])
 
 
 
 
 
 
 
 
 
28
 
29
- return target_seq, target_one_hot
30
 
31
 
32
  def tiger_predict(transcript_seq: str):
@@ -38,12 +52,12 @@ def tiger_predict(transcript_seq: str):
38
  print('no saved model!')
39
  exit()
40
 
41
- # parse transcript sequence into 23-nt target sequences and their one-hot encodings
42
- target_seq, target_seq_one_hot = process_data(transcript_seq)
43
 
44
  # get predictions
45
- normalized_lfc = tiger.predict_step(target_seq_one_hot)
46
- predictions = pd.DataFrame({'Target site': target_seq, 'Normalized LFC': tf.squeeze(normalized_lfc).numpy()})
47
 
48
  return predictions
49
 
 
7
  CONTEXT_3P = 0
8
  TARGET_LEN = CONTEXT_5P + GUIDE_LEN + CONTEXT_3P
9
  NUCLEOTIDE_TOKENS = dict(zip(['A', 'C', 'G', 'T'], [0, 1, 2, 3]))
10
+ NUCLEOTIDE_COMPLEMENT = dict(zip(['A', 'C', 'G', 'T'], ['T', 'G', 'C', 'A']))
11
 
12
 
13
  def process_data(transcript_seq: str):
 
18
  # get all target sites
19
  target_seq = [transcript_seq[i: i + TARGET_LEN] for i in range(len(transcript_seq) - TARGET_LEN)]
20
 
21
+ # prepare guide sequences
22
+ guide_seq = [seq[CONTEXT_5P:len(seq) - CONTEXT_3P] for seq in target_seq]
23
+ guide_seq = [''.join([NUCLEOTIDE_COMPLEMENT[nt] for nt in list(seq)]) for seq in guide_seq]
24
+
25
+ # tokenize sequence
26
  nucleotide_table = tf.lookup.StaticVocabularyTable(
27
  initializer=tf.lookup.KeyValueTensorInitializer(
28
  keys=tf.constant(list(NUCLEOTIDE_TOKENS.keys()), dtype=tf.string),
29
  values=tf.constant(list(NUCLEOTIDE_TOKENS.values()), dtype=tf.int64)),
30
  num_oov_buckets=1)
31
  target_tokens = nucleotide_table.lookup(tf.stack([list(t) for t in target_seq], axis=0))
32
+ guide_tokens = nucleotide_table.lookup(tf.stack([list(g) for g in guide_seq], axis=0))
33
+ pad_5p = 255 * tf.ones([guide_tokens.shape[0], CONTEXT_5P], dtype=guide_tokens.dtype)
34
+ pad_3p = 255 * tf.ones([guide_tokens.shape[0], CONTEXT_3P], dtype=guide_tokens.dtype)
35
+ guide_tokens = tf.concat([pad_5p, guide_tokens, pad_3p], axis=1)
36
+
37
+ # model inputs
38
+ model_inputs = tf.concat([
39
+ tf.reshape(tf.one_hot(target_tokens, depth=4), [len(target_seq), -1]),
40
+ tf.reshape(tf.one_hot(guide_tokens, depth=4), [len(guide_tokens), -1]),
41
+ ], axis=-1)
42
 
43
+ return target_seq, guide_seq, model_inputs
44
 
45
 
46
  def tiger_predict(transcript_seq: str):
 
52
  print('no saved model!')
53
  exit()
54
 
55
+ # parse transcript sequence
56
+ target_seq, guide_seq, model_inputs = process_data(transcript_seq)
57
 
58
  # get predictions
59
+ normalized_lfc = tiger.predict_step(model_inputs)
60
+ predictions = pd.DataFrame({'Guide': guide_seq, 'Normalized LFC': tf.squeeze(normalized_lfc).numpy()})
61
 
62
  return predictions
63