Spaces:
Running
Running
File size: 5,408 Bytes
0d0c645 66c57f6 a5afc1a c0e089e a5afc1a c0e089e adf804d c0e089e a5afc1a c0e089e a5afc1a c0e089e a5afc1a c0e089e 0d0c645 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import argparse
# GPU setup: turn on memory growth for every physical GPU that TensorFlow
# reports, then make only the first one visible to the runtime.
_physical_gpus = tf.config.list_physical_devices('GPU')
for _gpu in _physical_gpus:
    tf.config.experimental.set_memory_growth(_gpu, enable=True)
if _physical_gpus:
    tf.config.experimental.set_visible_devices(_physical_gpus[0], 'GPU')
class Encoder:
    """Encode an on-target / off-target sequence pair as a 24 x 7 matrix.

    Both sequences are left-padded with '-' to 24 positions. For every
    position the first five channels are the bitwise OR of the one-hot codes
    of the two bases (order A, T, G, C, '_'; '-' encodes as all zeros) and the
    last two channels flag the direction of a mismatch according to
    ``direction_dict``.

    Attributes set by the constructor:
        on_seq, off_seq: the padded sequences.
        sgRNA_code, off_code: per-position 5-channel codes of each sequence.
        on_off_code: the combined (24, 7) array.
        label, value: the optional category label / regression value, or
            ``None`` when the corresponding flag is off.

    Raises:
        ValueError: if either sequence is longer than 24 characters.
        KeyError: if a sequence contains a character outside A/T/G/C/'_'/'-'
            (an 'N' in the on-target sequence is allowed and takes the
            off-target base at the same position).
    """

    def __init__(self, on_seq, off_seq, with_category=False, label=None,
                 with_reg_val=False, value=None):
        tlen = 24
        # A negative pad count silently yields "", so over-long sequences
        # would otherwise slip through and produce a wrong-shaped matrix or
        # an index error later on. Reject them up front instead.
        for arg_name, seq in (("on_seq", on_seq), ("off_seq", off_seq)):
            if len(seq) > tlen:
                raise ValueError(
                    "{} has {} characters; at most {} are supported".format(
                        arg_name, len(seq), tlen))
        self.on_seq = "-" * (tlen - len(on_seq)) + on_seq
        self.off_seq = "-" * (tlen - len(off_seq)) + off_seq
        self.encoded_dict_indel = {
            'A': [1, 0, 0, 0, 0], 'T': [0, 1, 0, 0, 0],
            'G': [0, 0, 1, 0, 0], 'C': [0, 0, 0, 1, 0],
            '_': [0, 0, 0, 0, 1], '-': [0, 0, 0, 0, 0],
        }
        self.direction_dict = {'A': 5, 'G': 4, 'C': 3, 'T': 2, '_': 1}
        # Always define both attributes so reading them never raises
        # AttributeError when the corresponding flag is off.
        self.label = label if with_category else None
        self.value = value if with_reg_val else None
        self.encode_on_off_dim7()

    def encode_sgRNA(self):
        """Store the 5-channel code of the on-target sequence in ``sgRNA_code``.

        An 'N' in the on-target sequence is replaced by the off-target base at
        the same position before encoding.
        """
        table = self.encoded_dict_indel
        resolved = [off_b if on_b == "N" else on_b
                    for on_b, off_b in zip(self.on_seq, self.off_seq)]
        self.sgRNA_code = np.array([table[base] for base in resolved])

    def encode_off(self):
        """Store the 5-channel code of the off-target sequence in ``off_code``."""
        table = self.encoded_dict_indel
        self.off_code = np.array([table[base] for base in self.off_seq])

    def encode_on_off_dim7(self):
        """Build the combined 7-channel code and store it in ``on_off_code``."""
        self.encode_sgRNA()
        self.encode_off()
        rank = self.direction_dict
        rows = []
        for i, (on_b, off_b) in enumerate(zip(self.on_seq, self.off_seq)):
            if on_b == "N":
                on_b = off_b
            diff_code = np.bitwise_or(self.sgRNA_code[i], self.off_code[i])
            dir_code = np.zeros(2)
            # Padding positions carry no direction; equal ranks (a match)
            # leave both direction flags at zero.
            if on_b != "-" and off_b != "-":
                on_rank = rank[on_b]
                off_rank = rank[off_b]
                if on_rank > off_rank:
                    dir_code[0] = 1
                elif on_rank < off_rank:
                    dir_code[1] = 1
            rows.append(np.concatenate((diff_code, dir_code)))
        self.on_off_code = np.array(rows)
def encode_on_off_seq_pairs(input_file, output_file="CRISPR_net_results.csv"):
    """Score every sequence pair in a CSV file and write the results to disk.

    Args:
        input_file: path to a header-less, comma-separated file whose two
            columns are read as the on-target and off-target sequences.
        output_file: path of the CSV to write; defaults to
            "CRISPR_net_results.csv" in the current working directory.

    The written file contains the input columns plus a ``CRISPR_Net_score``
    column holding the value returned by ``CRISPR_net_predict`` for each row.
    """
    inputs = pd.read_csv(input_file, delimiter=",", header=None, names=['on_seq', 'off_seq'])
    input_codes = np.array([
        Encoder(on_seq=row['on_seq'], off_seq=row['off_seq']).on_off_code
        for _, row in inputs.iterrows()
    ])
    # One (1, 24, 7) sample per pair, matching what the prediction helper is given elsewhere.
    input_codes = input_codes.reshape((len(input_codes), 1, 24, 7))
    inputs['CRISPR_Net_score'] = CRISPR_net_predict(input_codes)
    inputs.to_csv(output_file, index=False)
def CRISPR_net_predict(X_test):
    """Load the saved CRISPR-Net model and return its predictions for X_test.

    The architecture is read from the JSON file and the weights from the HDF5
    file under ``cas9_model/`` (paths are relative to the current working
    directory). The model is rebuilt on every call.

    Args:
        X_test: encoded input array; callers in this file pass an array of
            shape (n, 1, 24, 7).

    Returns:
        The model's predictions flattened to a 1-D array.
    """
    # Use a context manager so the handle is closed even if reading fails.
    with open("cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_structure.json", 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = tf.keras.models.model_from_json(loaded_model_json)
    loaded_model.load_weights("cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_weights.h5")
    return loaded_model.predict(X_test).flatten()
def process_input_and_predict(input_data, input_type='manual'):
    """Encode on/off-target pairs, score them, and return the results.

    Args:
        input_data: when ``input_type`` is 'manual', a string with one
            "on_seq,off_seq" pair per line; when it is 'file', a path (or
            anything ``pandas.read_csv`` accepts) to a header-less CSV with
            the same two columns.
        input_type: 'manual' or 'file'.

    Returns:
        A DataFrame with columns ``on_seq``, ``off_seq`` and
        ``CRISPR_Net_score``, one row per usable pair. Rows with a missing or
        blank sequence are skipped. When no usable pair is found an empty
        DataFrame with those columns is returned and the model is not loaded.

    Raises:
        ValueError: if ``input_type`` is not recognised, or a manually
            entered line has more than two comma-separated fields.
    """
    columns = ['on_seq', 'off_seq']
    if input_type == 'manual':
        pairs = []
        # splitlines() also copes with Windows line endings, which would
        # otherwise leave a stray '\r' at the end of every off-target sequence.
        for line in input_data.splitlines():
            line = line.strip()
            if not line:
                continue
            fields = [field.strip() for field in line.split(',')]
            if len(fields) > 2:
                raise ValueError(
                    "expected 'on_seq,off_seq' but got {} fields: {!r}".format(
                        len(fields), line))
            if len(fields) < 2:
                # No off-target sequence on this line; nothing to score.
                continue
            pairs.append(fields)
        inputs = pd.DataFrame(pairs, columns=columns)
    elif input_type == 'file':
        inputs = pd.read_csv(input_data, delimiter=",", header=None, names=columns)
    else:
        raise ValueError(
            "input_type must be 'manual' or 'file', got {!r}".format(input_type))

    valid_inputs = []
    input_codes = []
    for on_seq, off_seq in zip(inputs['on_seq'], inputs['off_seq']):
        # Missing cells arrive as None/NaN, and NaN is truthy, so check the
        # type explicitly rather than relying on `not value`.
        if not isinstance(on_seq, str) or not isinstance(off_seq, str):
            continue
        on_seq = on_seq.strip()
        off_seq = off_seq.strip()
        if not on_seq or not off_seq:
            continue
        en = Encoder(on_seq=on_seq, off_seq=off_seq)
        input_codes.append(en.on_off_code)
        valid_inputs.append((on_seq, off_seq))

    if not valid_inputs:
        # Nothing to score: skip the model entirely instead of predicting on
        # an empty batch.
        return pd.DataFrame(columns=columns + ['CRISPR_Net_score'])

    input_codes = np.array(input_codes)
    input_codes = input_codes.reshape((len(input_codes), 1, 24, 7))
    y_pred = CRISPR_net_predict(input_codes)
    # Build the result from the pairs that were actually scored so rows and
    # predictions stay aligned.
    result_df = pd.DataFrame(valid_inputs, columns=columns)
    result_df['CRISPR_Net_score'] = y_pred
    return result_df
if __name__ == '__main__':
    # Command-line entry point: score the pairs in the given CSV file.
    cli = argparse.ArgumentParser(description="CRISPR-Net v1.0 (Aug 10 2019)")
    cli.add_argument(
        "input_file",
        help="input_file example (on-target seq, off-target seq):\n GAGT_CCGAGCAGAAGAAGAATGG,GAGTACCAAGTAGAAGAAAAATTT\n"
             "GTTGCCCCACAGGGCAGTAAAGG,GTGGACACCCCGGGCAGGAAAGG\n"
             "GGGTGGGGGGAGTTTGCTCCAGG,AGGTGGGGTGA_TTTGCTCCAGG",
    )
    input_path = cli.parse_args().input_file
    if os.path.exists(input_path):
        encode_on_off_seq_pairs(input_path)
    else:
        print("File doesn't exist!")
    # Release the Keras global state once the run is over.
    tf.keras.backend.clear_session()