Spaces:

Knowles-Lab
/

tiger

Running on CPU Upgrade

App Files Files Community

Andrew Stirn committed on Jan 4, 2023

Commit

457a981

1 Parent(s): a1b3810

off-target model with guide sequence utilization

Browse files

Files changed (6) hide show

model/fingerprint.pb +2 -2
model/keras_metadata.pb +2 -2
model/saved_model.pb +2 -2
model/variables/variables.data-00000-of-00001 +2 -2
model/variables/variables.index +2 -2
tiger.py +21 -7

model/fingerprint.pb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:01a50341063d589fc4efcbb7dc7354318f9dcdba65575e608759284dcc0d8162
-size 53

 version https://git-lfs.github.com/spec/v1
+oid sha256:52d2b657d4a87fe128786cd8435a2f4c8d4e5d08571b12ff8911f100c0ee043b
+size 54

model/keras_metadata.pb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9c7badc5998ecd142564cb70002b001ee812d404f4ac30976bb33c1233ab898a
-size 13592

 version https://git-lfs.github.com/spec/v1
+oid sha256:8e89af418a7cbb78442c6a65f20f2817352361b826189c4aad0de8a531aa5a8d
+size 13629

model/saved_model.pb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c94e1de45290f8663320886419bd4cf611aa7fa00fce146bc0d96d35b8b5e39
-size 214038

 version https://git-lfs.github.com/spec/v1
+oid sha256:f91692a0db6169ce09c321d292a7622468ea1b46c2f2293d97e43aa7c9cb9719
+size 241848

model/variables/variables.data-00000-of-00001 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:96f573e7920d24eacd8c00c32f2995392f038629a4ce5ee27d6454448025276e
-size 522375

 version https://git-lfs.github.com/spec/v1
+oid sha256:ff1487ef1c93444ea6eeb6b023a0f4095aa0af473d98022d8c6e8b9e339d0add
+size 948103

model/variables/variables.index CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:612540024fe115acd056ffc34e9d73b223a14c8620f06c6caee871a3a61f8941
-size 869

 version https://git-lfs.github.com/spec/v1
+oid sha256:f00477f5d3801ba7e566cfcb16d16970ee88a94b716db4232e30ac27115bbfbd
+size 877

tiger.py CHANGED Viewed

@@ -7,6 +7,7 @@ CONTEXT_5P = 3
 CONTEXT_3P = 0
 TARGET_LEN = CONTEXT_5P + GUIDE_LEN + CONTEXT_3P
 NUCLEOTIDE_TOKENS = dict(zip(['A', 'C', 'G', 'T'], [0, 1, 2, 3]))
 def process_data(transcript_seq: str):
@@ -17,16 +18,29 @@ def process_data(transcript_seq: str):
     # get all target sites
     target_seq = [transcript_seq[i: i + TARGET_LEN] for i in range(len(transcript_seq) - TARGET_LEN)]
-    # get one-hot encodings
     nucleotide_table = tf.lookup.StaticVocabularyTable(
         initializer=tf.lookup.KeyValueTensorInitializer(
             keys=tf.constant(list(NUCLEOTIDE_TOKENS.keys()), dtype=tf.string),
             values=tf.constant(list(NUCLEOTIDE_TOKENS.values()), dtype=tf.int64)),
         num_oov_buckets=1)
     target_tokens = nucleotide_table.lookup(tf.stack([list(t) for t in target_seq], axis=0))
-    target_one_hot = tf.reshape(tf.one_hot(target_tokens, depth=4), [len(target_seq), -1])
-    return target_seq, target_one_hot
 def tiger_predict(transcript_seq: str):
@@ -38,12 +52,12 @@ def tiger_predict(transcript_seq: str):
         print('no saved model!')
         exit()
-    # parse transcript sequence into 23-nt target sequences and their one-hot encodings
-    target_seq, target_seq_one_hot = process_data(transcript_seq)
     # get predictions
-    normalized_lfc = tiger.predict_step(target_seq_one_hot)
-    predictions = pd.DataFrame({'Target site': target_seq, 'Normalized LFC': tf.squeeze(normalized_lfc).numpy()})
     return predictions

 CONTEXT_3P = 0
 TARGET_LEN = CONTEXT_5P + GUIDE_LEN + CONTEXT_3P
 NUCLEOTIDE_TOKENS = dict(zip(['A', 'C', 'G', 'T'], [0, 1, 2, 3]))
+NUCLEOTIDE_COMPLEMENT = dict(zip(['A', 'C', 'G', 'T'], ['T', 'G', 'C', 'A']))
 def process_data(transcript_seq: str):
     # get all target sites
     target_seq = [transcript_seq[i: i + TARGET_LEN] for i in range(len(transcript_seq) - TARGET_LEN)]
+    # prepare guide sequences
+    guide_seq = [seq[CONTEXT_5P:len(seq) - CONTEXT_3P] for seq in target_seq]
+    guide_seq = [''.join([NUCLEOTIDE_COMPLEMENT[nt] for nt in list(seq)]) for seq in guide_seq]
+    # tokenize sequence
     nucleotide_table = tf.lookup.StaticVocabularyTable(
         initializer=tf.lookup.KeyValueTensorInitializer(
             keys=tf.constant(list(NUCLEOTIDE_TOKENS.keys()), dtype=tf.string),
             values=tf.constant(list(NUCLEOTIDE_TOKENS.values()), dtype=tf.int64)),
         num_oov_buckets=1)
     target_tokens = nucleotide_table.lookup(tf.stack([list(t) for t in target_seq], axis=0))
+    guide_tokens = nucleotide_table.lookup(tf.stack([list(g) for g in guide_seq], axis=0))
+    pad_5p = 255 * tf.ones([guide_tokens.shape[0], CONTEXT_5P], dtype=guide_tokens.dtype)
+    pad_3p = 255 * tf.ones([guide_tokens.shape[0], CONTEXT_3P], dtype=guide_tokens.dtype)
+    guide_tokens = tf.concat([pad_5p, guide_tokens, pad_3p], axis=1)
+    # model inputs
+    model_inputs = tf.concat([
+        tf.reshape(tf.one_hot(target_tokens, depth=4), [len(target_seq), -1]),
+        tf.reshape(tf.one_hot(guide_tokens, depth=4), [len(guide_tokens), -1]),
+        ], axis=-1)
+    return target_seq, guide_seq, model_inputs
 def tiger_predict(transcript_seq: str):
         print('no saved model!')
         exit()
+    # parse transcript sequence
+    target_seq, guide_seq, model_inputs = process_data(transcript_seq)
     # get predictions
+    normalized_lfc = tiger.predict_step(model_inputs)
+    predictions = pd.DataFrame({'Guide': guide_seq, 'Normalized LFC': tf.squeeze(normalized_lfc).numpy()})
     return predictions