File size: 5,408 Bytes
0d0c645
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66c57f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5afc1a
c0e089e
 
 
 
 
 
 
a5afc1a
c0e089e
 
 
 
adf804d
 
 
c0e089e
 
a5afc1a
c0e089e
 
 
 
a5afc1a
 
 
 
 
c0e089e
a5afc1a
c0e089e
0d0c645
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import argparse

# Configure GPUs: turn on memory growth for every detected GPU, then expose
# only the first detected GPU to TensorFlow.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, enable=True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

class Encoder:
    """Encode an on-target / off-target sequence pair position by position.

    Both sequences are left-padded with "-" to 24 characters (longer inputs
    are kept as they are). Construction immediately runs
    ``encode_on_off_dim7``, which fills in:

    * ``sgRNA_code``  - 5-channel code of the on-target sequence,
    * ``off_code``    - 5-channel code of the off-target sequence,
    * ``on_off_code`` - 7-channel code: the element-wise OR of the two
      5-channel codes followed by a 2-channel direction indicator.

    ``label`` / ``value`` are stored only when ``with_category`` /
    ``with_reg_val`` are truthy; otherwise those attributes are not created.
    """

    def __init__(self, on_seq, off_seq, with_category=False, label=None, with_reg_val=False, value=None):
        target_len = 24
        on_padding = "-" * (target_len - len(on_seq))
        self.on_seq = on_padding + on_seq
        off_padding = "-" * (target_len - len(off_seq))
        self.off_seq = off_padding + off_seq
        # Channel order is A, T, G, C, '_'; the padding symbol '-' maps to all zeros.
        self.encoded_dict_indel = {'A': [1, 0, 0, 0, 0], 'T': [0, 1, 0, 0, 0],
                                   'G': [0, 0, 1, 0, 0], 'C': [0, 0, 0, 1, 0], '_': [0, 0, 0, 0, 1], '-': [0, 0, 0, 0, 0]}
        # Rank used to decide which of the two direction channels is set on a mismatch.
        self.direction_dict = {'A':5, 'G':4, 'C':3, 'T':2, '_':1}
        if with_category:
            self.label = label
        if with_reg_val:
            self.value = value
        self.encode_on_off_dim7()

    def encode_sgRNA(self):
        """Build ``self.sgRNA_code`` from the padded on-target sequence.

        An "N" in the on-target sequence is replaced by the off-target
        character at the same index before the table lookup.
        """
        table = self.encoded_dict_indel
        self.sgRNA_code = np.array([
            table[self.off_seq[pos] if base == "N" else base]
            for pos, base in enumerate(self.on_seq)
        ])

    def encode_off(self):
        """Build ``self.off_code`` from the padded off-target sequence."""
        table = self.encoded_dict_indel
        self.off_code = np.array([table[base] for base in self.off_seq])

    def encode_on_off_dim7(self):
        """Build ``self.on_off_code``, one 7-element row per on-target position.

        Each row is the bitwise OR of the on/off 5-channel codes followed by
        two direction channels. The direction channels stay zero when either
        character is the padding symbol "-" or when both characters have the
        same rank; otherwise channel 0 is set when the on-target rank is
        higher and channel 1 when it is lower.
        """
        self.encode_sgRNA()
        self.encode_off()
        ranks = self.direction_dict
        rows = []
        for pos, on_base in enumerate(self.on_seq):
            merged = np.bitwise_or(self.sgRNA_code[pos], self.off_code[pos])
            off_base = self.off_seq[pos]
            if on_base == "N":
                # "N" takes the off-target character, so it never counts as a mismatch.
                on_base = off_base
            direction = np.zeros(2)
            if "-" not in (on_base, off_base):
                on_rank = ranks[on_base]
                off_rank = ranks[off_base]
                if on_rank > off_rank:
                    direction[0] = 1
                elif on_rank < off_rank:
                    direction[1] = 1
            rows.append(np.concatenate((merged, direction)))
        self.on_off_code = np.array(rows)

def encode_on_off_seq_pairs(input_file):
    """Score every sequence pair in a headerless two-column CSV file.

    The file is read with columns named ``on_seq`` and ``off_seq``; each row
    is encoded with ``Encoder``, the encoded batch is scored by
    ``CRISPR_net_predict``, and the input table plus a ``CRISPR_Net_score``
    column is written to ``CRISPR_net_results.csv`` in the current working
    directory. Nothing is returned.
    """
    pairs = pd.read_csv(input_file, delimiter=",", header=None, names=['on_seq', 'off_seq'])
    encoded = [
        Encoder(on_seq=on_seq, off_seq=off_seq).on_off_code
        for on_seq, off_seq in zip(pairs['on_seq'], pairs['off_seq'])
    ]
    # Add a singleton axis so the batch has shape (n_pairs, 1, 24, 7).
    batch = np.array(encoded)
    batch = batch.reshape((len(batch), 1, 24, 7))
    pairs['CRISPR_Net_score'] = CRISPR_net_predict(batch)
    pairs.to_csv("CRISPR_net_results.csv", index=False)

def CRISPR_net_predict(X_test):
    """Score encoded sequence pairs with the saved CRISPR-Net model.

    The model architecture is rebuilt from the JSON file and the weights are
    loaded from the HDF5 file, both under ``cas9_model/`` relative to the
    current working directory. The model is reloaded on every call.

    Args:
        X_test: Batch handed to ``model.predict``. The callers in this file
            pass an array of shape ``(n_pairs, 1, 24, 7)``.

    Returns:
        The model's predictions flattened to a 1-D array.
    """
    # The context manager closes the handle even if read() raises.
    with open("cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_structure.json", 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = tf.keras.models.model_from_json(loaded_model_json)  # Updated for TensorFlow 2
    loaded_model.load_weights("cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_weights.h5")
    return loaded_model.predict(X_test).flatten()


def process_input_and_predict(input_data, input_type='manual'):
    """Encode on/off-target sequence pairs, score them, and return a table.

    Args:
        input_data: For ``input_type='manual'``, a string holding one
            ``on_seq,off_seq`` pair per line. For ``input_type='file'``, a
            path (or anything ``pandas.read_csv`` accepts) to a headerless
            two-column CSV.
        input_type: ``'manual'`` or ``'file'``.

    Returns:
        A DataFrame with columns ``on_seq``, ``off_seq`` and
        ``CRISPR_Net_score`` holding only the rows where both sequences were
        present. If no row qualifies, an empty DataFrame with those columns
        is returned and the model is not invoked.

    Raises:
        ValueError: If ``input_type`` is not recognised, or a manual line
            contains more than two comma-separated fields.
    """
    if input_type == 'manual':
        rows = []
        # splitlines() copes with "\n" and "\r\n"; fields are stripped so
        # stray whitespace never reaches the encoder's lookup table.
        for line in input_data.splitlines():
            fields = [field.strip() for field in line.split(',')]
            if len(fields) > 2:
                raise ValueError(
                    "Expected 'on_seq,off_seq' per line, got %d fields: %r" % (len(fields), line))
            # Pad short lines with None so they are skipped below.
            fields.extend([None] * (2 - len(fields)))
            rows.append(fields)
        inputs = pd.DataFrame(rows, columns=['on_seq', 'off_seq'])
    elif input_type == 'file':
        inputs = pd.read_csv(input_data, delimiter=",", header=None, names=['on_seq', 'off_seq'])
    else:
        raise ValueError("input_type must be 'manual' or 'file', got %r" % (input_type,))

    valid_inputs = []
    input_codes = []
    for on_seq, off_seq in zip(inputs['on_seq'], inputs['off_seq']):
        # read_csv yields NaN (which is truthy) for missing cells, so test
        # for missing values explicitly before the emptiness check.
        if pd.isna(on_seq) or pd.isna(off_seq) or not on_seq or not off_seq:
            continue

        en = Encoder(on_seq=on_seq, off_seq=off_seq)
        input_codes.append(en.on_off_code)
        valid_inputs.append((on_seq, off_seq))

    # Create a new DataFrame from valid inputs and predictions
    result_df = pd.DataFrame(valid_inputs, columns=['on_seq', 'off_seq'])
    if not valid_inputs:
        # Nothing to score: skip loading and running the model on an empty batch.
        result_df['CRISPR_Net_score'] = np.empty(0, dtype=float)
        return result_df

    input_codes = np.array(input_codes)
    input_codes = input_codes.reshape((len(input_codes), 1, 24, 7))

    result_df['CRISPR_Net_score'] = CRISPR_net_predict(input_codes)

    return result_df

if __name__ == '__main__':
    # Command-line entry point: score the pairs in the given CSV file and
    # write the results via encode_on_off_seq_pairs.
    cli = argparse.ArgumentParser(description="CRISPR-Net v1.0 (Aug 10 2019)")
    cli.add_argument(
        "input_file",
        help="input_file example (on-target seq, off-target seq):\n GAGT_CCGAGCAGAAGAAGAATGG,GAGTACCAAGTAGAAGAAAAATTT\n"
             "GTTGCCCCACAGGGCAGTAAAGG,GTGGACACCCCGGGCAGGAAAGG\n"
             "GGGTGGGGGGAGTTTGCTCCAGG,AGGTGGGGTGA_TTTGCTCCAGG",
    )
    input_path = cli.parse_args().input_file
    if os.path.exists(input_path):
        encode_on_off_seq_pairs(input_path)
        # Drop the Keras global state once scoring is finished.
        tf.keras.backend.clear_session()
    else:
        print("File doesn't exist!")