| import argparse |
| import csv |
| import os |
| import random |
| import numpy as np |
| from process_pretrain_data import get_kmer_sentence |
|
|
| max_length = 0 |
|
|
| def write_file(lines, path, kmer, head=True, seq_index=0, label_index=1): |
| with open(path, 'wt') as f: |
| tsv_w = csv.writer(f, delimiter='\t') |
| if head: |
| tsv_w.writerow(["setence", "label"]) |
| for line in lines: |
| if kmer == 0: |
| sentence = str(line[seq_index]) |
| else: |
| sentence = str(get_kmer_sentence("".join(line[seq_index].split()), kmer)) |
| if label_index == None: |
| label = "0" |
| else: |
| label = str(line[label_index]) |
| tsv_w.writerow([sentence, label]) |
|
|
|
|
| def Shuffle(args): |
| old_file = open(args.file_path, "r", encoding="utf-8-sig") |
| old_lines = list(csv.reader(old_file, delimiter="\t", quotechar=None))[1:] |
| random.shuffle(old_lines) |
|
|
| write_file(old_lines, args.file_path, 0) |
|
|
| def Find_train(args): |
| random.seed(args.seed) |
|
|
| tata = args.file_path + "/TATA_249to50.tsv" |
| notata = args.file_path + "/noTATA_249to50.tsv" |
| tata_file = open(tata, "r", encoding="utf-8-sig") |
| notata_file = open(notata, "r", encoding="utf-8-sig") |
| tata_lines = list(csv.reader(tata_file, delimiter="\t", quotechar=None))[1:] |
| notata_lines = list(csv.reader(notata_file, delimiter="\t", quotechar=None))[1:] |
|
|
| tata_test = args.file_path + "/tata_test.tsv" |
| notata_test = args.file_path + "/notata_test.tsv" |
| tata_test_file = open(tata_test, "r", encoding="utf-8-sig") |
| notata_test_file = open(notata_test, "r", encoding="utf-8-sig") |
| tata_test_lines = list(csv.reader(tata_test_file, delimiter="\t", quotechar=None))[1:] |
| notata_test_lines = list(csv.reader(notata_test_file, delimiter="\t", quotechar=None))[1:] |
|
|
| |
| train_lines = [] |
|
|
| for line in tata_lines: |
| if [line[0], line[1]] not in tata_test_lines: |
| train_lines.append([line[0], line[1]]) |
| |
|
|
| for line in notata_lines: |
| if [line[0], line[1]] not in notata_test_lines: |
| train_lines.append([line[0], line[1]]) |
| |
| random.shuffle(train_lines) |
| random.shuffle(train_lines) |
|
|
| |
| |
| |
|
|
| |
| write_file(train_lines, args.file_path+"/train.tsv", args.kmer, head=False) |
| |
|
|
| for kmer in range(3,7): |
| root_path = os.path.join(args.file_path, str(kmer)) |
| if not os.path.exists(root_path): |
| os.makedirs(root_path) |
|
|
| train_file = open(os.path.join(args.file_path,"train.tsv"), "r", encoding="utf-8-sig") |
| lines = list(csv.reader(train_file, delimiter="\t", quotechar=None)) |
| train_path = os.path.join(root_path,"train.tsv") |
|
|
| write_file(lines, train_path, kmer) |
|
|
| tata_path = os.path.join(root_path, "tata") |
| notata_path = os.path.join(root_path, "notata") |
| os.makedirs(tata_path) |
| os.makedirs(notata_path) |
|
|
| dev_lines = tata_test_lines+notata_test_lines |
| dev_path = os.path.join(root_path,"dev.tsv") |
|
|
| write_file(tata_test_lines, os.path.join(tata_path, "dev.tsv"), kmer) |
| write_file(notata_test_lines, os.path.join(notata_path, "dev.tsv"), kmer) |
| write_file(dev_lines, dev_path, kmer) |
|
|
| def Process_1000(args): |
| random.seed(args.seed) |
|
|
| tata_train = args.file_path + "TATA_scan_train.csv" |
| notata_train = args.file_path + "noTATA_scan_train.csv" |
| tata_train_file = open(tata_train, "r", encoding="utf-8-sig") |
| notata_train_file = open(notata_train, "r", encoding="utf-8-sig") |
| tata_train_lines = list(csv.reader(tata_train_file, delimiter=",", quotechar=None))[1:] |
| notata_train_lines = list(csv.reader(notata_train_file, delimiter=",", quotechar=None))[1:] |
|
|
| tata_test = args.file_path + "/TATA_scan_test.csv" |
| notata_test = args.file_path + "/noTATA_scan_test.csv" |
| tata_test_file = open(tata_test, "r", encoding="utf-8-sig") |
| notata_test_file = open(notata_test, "r", encoding="utf-8-sig") |
| tata_test_lines = list(csv.reader(tata_test_file, delimiter=",", quotechar=None))[1:] |
| notata_test_lines = list(csv.reader(notata_test_file, delimiter=",", quotechar=None))[1:] |
| |
|
|
| print("Original:") |
| print("tata train: %d" % (len(tata_train_lines))) |
| print("notata train: %d" % (len(notata_train_lines))) |
| print("tata test: %d" % (len(tata_test_lines))) |
| print("tata test: %d" % (len(notata_test_lines))) |
|
|
| random.shuffle(tata_train_lines) |
| random.shuffle(notata_train_lines) |
| random.shuffle(tata_test_lines) |
| random.shuffle(notata_test_lines) |
|
|
| |
| notata_train_lines = notata_train_lines[:len(tata_train_lines)] |
| notata_test_lines = notata_test_lines[:len(tata_test_lines)] |
| with open(os.path.join(args.file_path, "notata_test_id"), "w") as f: |
| tsv_w = csv.writer(f, delimiter=',') |
| tsv_w.writerow(["index", "chrom", "start", "end", "name", "strand", "keys", "id"]) |
| for line in notata_test_lines: |
| tsv_w.writerow([line[0], line[1], line[2], line[3], line[4], line[5], line[7], line[9]]) |
|
|
|
|
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
|
|
|
|
| def Process_1000_kmer(args, test_lines=None, train_lines=None, tata_test_lines=None, tata_train_lines=None, notata_test_lines=None, notata_train_lines=None): |
| |
| LOAD = True |
| output_path = args.output_path if args.output_path is not None else args.file_path |
|
|
| if test_lines == None: |
| path1 = os.path.join(args.file_path,"dev.tsv") |
| path2 = os.path.join(args.file_path,"train.tsv") |
| path3 = os.path.join(args.file_path,"tata_dev.tsv") |
| path4 = os.path.join(args.file_path,"tata_train.tsv") |
| path5 = os.path.join(args.file_path,"notata_dev.tsv") |
| path6 = os.path.join(args.file_path,"notata_train.tsv") |
|
|
| file1 = open(path1, "r", encoding="utf-8-sig") |
| file2 = open(path2, "r", encoding="utf-8-sig") |
| file3 = open(path3, "r", encoding="utf-8-sig") |
| file4 = open(path4, "r", encoding="utf-8-sig") |
| file5 = open(path5, "r", encoding="utf-8-sig") |
| file6 = open(path6, "r", encoding="utf-8-sig") |
|
|
| test_lines = list(csv.reader(file1, delimiter="\t", quotechar=None)) |
| train_lines = list(csv.reader(file2, delimiter="\t", quotechar=None)) |
| tata_test_lines = list(csv.reader(file3, delimiter="\t", quotechar=None)) |
| tata_train_lines = list(csv.reader(file4, delimiter="\t", quotechar=None)) |
| notata_test_lines = list(csv.reader(file5, delimiter="\t", quotechar=None)) |
| notata_train_lines = list(csv.reader(file6, delimiter="\t", quotechar=None)) |
|
|
| LOAD = False |
|
|
|
|
|
|
| for kmer in range(3,7): |
|
|
| print(kmer) |
| root_path = os.path.join(output_path, str(kmer)) |
| if not os.path.exists(root_path): |
| os.makedirs(root_path) |
|
|
| all_path = os.path.join(root_path, "all") |
| |
| notata_path = os.path.join(root_path, "notata") |
| os.makedirs(all_path) |
| |
| os.makedirs(notata_path) |
|
|
| if LOAD: |
| seq_index=8 |
| label_index=6 |
| else: |
| seq_index=0 |
| label_index=1 |
| |
| print("writing dev") |
| write_file(test_lines, os.path.join(all_path,"dev.tsv"), kmer, head=False, seq_index=seq_index, label_index=label_index) |
| print("writing train") |
| write_file(train_lines, os.path.join(all_path,"train.tsv"), kmer, head=False, seq_index=seq_index, label_index=label_index) |
| |
| |
| |
| |
| print("writing notata dev") |
| write_file(notata_test_lines, os.path.join(notata_path,"dev.tsv"), kmer, head=False, seq_index=seq_index, label_index=label_index) |
| print("writing notata train") |
| write_file(notata_train_lines, os.path.join(notata_path,"train.tsv"), kmer, head=False, seq_index=seq_index, label_index=label_index) |
| |
|
|
| def Process_splice(args): |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| X_test = np.load(os.path.join(args.file_path, "x_test.npy")) |
| Y_test = np.load(os.path.join(args.file_path, "y_test.npy")) |
|
|
| assert len(X_test) == len(Y_test) |
| |
| for kmer in range(3,7): |
| root_path = os.path.join(args.file_path, str(kmer)) |
| os.makedirs(root_path) |
| f_test = open(os.path.join(root_path, "dev.tsv"), "wt") |
| tsv_test = csv.writer(f_test, delimiter='\t') |
| tsv_test.writerow(["seq", "label"]) |
|
|
| for i, seq in enumerate(X_test): |
| sequence = get_kmer_sentence(str(seq), kmer) |
| label = int(np.where(Y_test[i]==1)[0]) |
| tsv_test.writerow([sequence, label]) |
| |
| |
| def Process_prom_core(args): |
| random.seed(args.seed) |
|
|
| tata = args.file_path + "/TATA.csv" |
| notata = args.file_path + "/noTATA.csv" |
| tata_file = open(tata, "r", encoding="utf-8-sig") |
| notata_file = open(notata, "r", encoding="utf-8-sig") |
| tata_lines = list(csv.reader(tata_file, delimiter=",", quotechar=None))[1:] |
| notata_lines = list(csv.reader(notata_file, delimiter=",", quotechar=None))[1:] |
|
|
| random.shuffle(tata_lines) |
| random.shuffle(notata_lines) |
|
|
| num_tata_test = int(0.1*len(tata_lines)) |
| tata_test_lines = tata_lines[:num_tata_test] |
| num_notata_test = int(0.1*len(notata_lines)) |
| notata_test_lines = notata_lines[:num_notata_test] |
|
|
| train_lines = tata_lines[num_tata_test:] + notata_lines[num_notata_test:] |
| if args.dev: |
| num_dev = int(len(rest_lines)/9.0) |
| dev_lines = train_lines[:num_dev] |
| train_lines = train_lines[num_dev:] |
| else: |
| dev_lines = tata_test_lines + notata_test_lines |
| |
| print("Number train examples: %d" % (len(train_lines))) |
| print("Number dev examples: %d" % (len(dev_lines))) |
|
|
| for kmer in range(3,7): |
| root_path = os.path.join(args.file_path,str(kmer)) |
| tata_path = os.path.join(root_path, "tata") |
| notata_path = os.path.join(root_path, "notata") |
| os.makedirs(tata_path) |
| os.makedirs(notata_path) |
|
|
| write_file(tata_test_lines, os.path.join(tata_path,"dev.tsv"), kmer, head=False, seq_index=1, label_index=2) |
| write_file(notata_test_lines, os.path.join(notata_path,"dev.tsv"), kmer, head=False, seq_index=1, label_index=2) |
| write_file(train_lines, os.path.join(root_path,"train.tsv"), kmer, head=False, seq_index=1, label_index=2) |
| write_file(dev_lines, os.path.join(root_path,"dev.tsv"), kmer, head=False, seq_index=1, label_index=2) |
|
|
|
|
| def Process_pair(args): |
| random.seed(args.seed) |
|
|
| root_path = args.file_path.split('/')[-1] |
| train_seq1_file = open(args.file_path+"/"+root_path+"_enhancer.fasta", "r") |
| train_seq2_file = open(args.file_path+"/"+root_path+"_promoter.fasta", "r") |
| train_label_file = open(args.file_path+"/"+root_path+"_label.txt", "r") |
| test_seq1_file = open(args.file_path+"/"+root_path+"_enhancer_test.fasta", "r") |
| test_seq2_file = open(args.file_path+"/"+root_path+"_promoter_test.fasta", "r") |
| test_label_file = open(args.file_path+"/"+root_path+"_label_test.txt", "r") |
|
|
| train_seq1 = train_seq1_file.readlines() |
| train_seq2 = train_seq2_file.readlines() |
| train_label = train_label_file.readlines() |
| test_seq1 = test_seq1_file.readlines() |
| test_seq2 = test_seq2_file.readlines() |
| test_label = test_label_file.readlines() |
|
|
| train_lines = [] |
| test_lines = [] |
| for i in range(len(train_label)): |
| train_lines.append([train_seq1[2*i+1], train_seq2[2*i+1], train_label[i]]) |
| for i in range(len(test_label)): |
| test_lines.append([test_seq1[2*i+1], test_seq2[2*i+1], test_label[i]]) |
|
|
| random.shuffle(train_lines) |
|
|
| if args.dev: |
| num_dev = int(len(train_lines)/10) |
| dev_lines = train_lines[:num_dev] |
| train_lines = train_lines[num_dev:] |
| |
| output_path = args.output_path if args.output_path else os.path.join(args.file_path, str(args.kmer)) |
| if not os.path.exists(output_path): |
| os.makedirs(output_path) |
|
|
| f_train = open(os.path.join(output_path, "train.tsv"), 'wt') |
| train_w = csv.writer(f_train, delimiter='\t') |
| train_w.writerow(["seq1", "seq2", "label"]) |
| if args.dev: |
| f_dev = open(os.path.join(output_path, "dev.tsv"), 'wt') |
| dev_w = csv.writer(f_dev, delimiter='\t') |
| dev_w.writerow(["seq1", "seq2", "label"]) |
| os.makedirs(os.path.join(output_path, "test")) |
| f_test = open(os.path.join(output_path, "test", "dev.tsv"), 'wt') |
| test_w = csv.writer(f_test, delimiter='\t') |
| test_w.writerow(["seq1", "seq2", "label"]) |
| else: |
| f_test = open(os.path.join(output_path, "dev.tsv"), 'wt') |
| test_w = csv.writer(f_test, delimiter='\t') |
| test_w.writerow(["seq1", "seq2", "label"]) |
|
|
| def write_file_pair(lines, writer, seq1_index=0, seq2_index=1, label_index=2): |
| for line in lines: |
| seq1 = get_kmer_sentence(line[seq1_index],args.kmer) |
| seq2 = get_kmer_sentence(line[seq2_index],args.kmer) |
| writer.writerow([seq1, seq2, str(int(line[label_index]))]) |
|
|
| write_file_pair(train_lines, train_w) |
| write_file_pair(test_lines, test_w) |
| |
| if args.dev: |
| write_file_pair(dev_lines, dev_w) |
|
|
|
|
| def Process_p53_mut(args): |
| random.seed(args.seed) |
| |
| dev = os.path.join(args.file_path, "dev.csv") |
| dev_file = open(dev, "r", encoding="utf-8-sig") |
|
|
| lines = list(csv.reader(dev_file, delimiter=",", quotechar=None))[1:] |
|
|
| print(lines[0]) |
|
|
| for kmer in range(3, 7): |
| output_path = args.output_path if args.output_path else os.path.join(args.file_path, str(kmer)) |
| if not os.path.exists(output_path): |
| os.makedirs(output_path) |
| |
| write_file(lines, os.path.join(output_path, "dev.tsv"), kmer, head=True, seq_index=2, label_index=None) |
|
|
|
|
| def Process_p53(args): |
| random.seed(args.seed) |
| |
| train = os.path.join(args.file_path, "train.csv") |
| test = os.path.join(args.file_path, "test.csv") |
| train_file = open(train, "r", encoding="utf-8-sig") |
| test_file = open(test, "r", encoding="utf-8-sig") |
|
|
| train_lines = list(csv.reader(train_file, delimiter=",", quotechar=None))[1:] |
| test_lines = list(csv.reader(test_file, delimiter=",", quotechar=None))[1:] |
| lines = train_lines + test_lines |
|
|
| max_length = 0 |
| for line in lines: |
| if len(line[2]) > max_length: |
| max_length = len(line[2]) |
|
|
| random.shuffle(train_lines) |
| random.shuffle(test_lines) |
|
|
| if args.dev: |
| num_dev = int(len(train_lines)/9) |
| dev_lines = train_lines[:num_dev] |
| train_lines = train_lines[num_dev:] |
|
|
| print(train_lines[0]) |
|
|
| for kmer in range(3, 7): |
| output_path = args.output_path if args.output_path else os.path.join(args.file_path, str(kmer)) |
| if not os.path.exists(output_path): |
| os.makedirs(output_path) |
| |
| write_file(train_lines, os.path.join(output_path, "train.tsv"), kmer, head=True, seq_index=2, label_index=3) |
| if args.dev: |
| write_file(dev_lines, os.path.join(output_path, "dev.tsv"), kmer, head=True, seq_index=2, label_index=3) |
| os.makedirs(os.path.join(output_path, "test")) |
| write_file(test_lines, os.path.join(output_path, "test", "dev.tsv"), kmer, head=True, seq_index=2, label_index=3) |
| else: |
| write_file(test_lines, os.path.join(output_path, "dev.tsv"), kmer, head=True, seq_index=2, label_index=3) |
|
|
| print("max length: %d" % (max_length)) |
|
|
|
|
| def Seperate_p53(args): |
| random.seed(args.seed) |
| |
| train = os.path.join(args.file_path, "train.csv") |
| test = os.path.join(args.file_path, "test.csv") |
| train_file = open(train, "r", encoding="utf-8-sig") |
| test_file = open(test, "r", encoding="utf-8-sig") |
|
|
| train_lines = list(csv.reader(train_file, delimiter=",", quotechar=None))[1:] |
| test_lines = list(csv.reader(test_file, delimiter=",", quotechar=None))[1:] |
| lines = train_lines + test_lines |
|
|
| POS = [] |
| NEG = [] |
|
|
| for line in lines: |
| if str(line[-1]) == '0': |
| NEG.append([line[-2], line[-1]]) |
| else: |
| POS.append([line[-2], line[-1]]) |
|
|
| |
|
|
| for kmer in range(3,7): |
| os.makedirs(os.path.join(args.file_path, "POS", str(kmer))) |
| os.makedirs(os.path.join(args.file_path, "NEG", str(kmer))) |
|
|
| write_file(POS, os.path.join(args.file_path, "POS", str(kmer), "dev.tsv"), kmer=kmer, head=True, seq_index=0, label_index=1) |
| write_file(NEG, os.path.join(args.file_path, "NEG", str(kmer), "dev.tsv"), kmer=kmer, head=True, seq_index=0, label_index=1) |
| |
|
|
|
|
| def Generate_prom_train_dev(args): |
| |
| tata = args.file_path + "/noTATA_249to50.tsv" |
| notata = args.file_path + "/TATA_249to50.tsv" |
| tata_file = open(tata, "r", encoding="utf-8-sig") |
| notata_file = open(notata, "r", encoding="utf-8-sig") |
| tata_lines = list(csv.reader(tata_file, delimiter="\t", quotechar=None))[1:] |
| notata_lines = list(csv.reader(notata_file, delimiter="\t", quotechar=None))[1:] |
| |
| |
| |
| random.shuffle(tata_lines) |
| random.shuffle(notata_lines) |
| num_tata_test = int(len(tata_lines)*0.1) |
| tata_test_lines = tata_lines[:num_tata_test] |
| num_notata_test = int(len(notata_lines)*0.1) |
| notata_test_lines = notata_lines[:num_notata_test] |
| train_lines = tata_lines[num_tata_test:] + notata_lines[num_notata_test:] |
| test_lines = tata_test_lines + notata_test_lines |
|
|
|
|
| write_file(train_lines, args.file_path+"/train.tsv", args.kmer) |
| write_file(test_lines, args.file_path+"/dev.tsv", args.kmer) |
| write_file(tata_test_lines, args.file_path+"/tata_dev.tsv", args.kmer) |
| write_file(notata_test_lines, args.file_path+"/notata_dev.tsv", args.kmer) |
|
|
| def Process_690(args): |
| path = args.file_path |
| all_folders = os.listdir(path) |
| |
| count = 0 |
|
|
| for folder in all_folders: |
| |
| train_seq_path = os.path.join(args.file_path, folder, "train", "sequences_alph.npy") |
| test_seq_path = os.path.join(args.file_path, folder, "test", "sequences_alph.npy") |
| train_lab_path = os.path.join(args.file_path, folder, "train", "targets.npy") |
| test_lab_path = os.path.join(args.file_path, folder, "test", "targets.npy") |
| train_sequences = np.load(train_seq_path) |
| test_sequences = np.load(test_seq_path) |
| train_labels = np.load(train_lab_path) |
| test_labels = np.load(test_lab_path) |
|
|
| train_sequences = train_sequences.reshape(train_sequences.shape[0],1) |
| test_sequences = test_sequences.reshape(test_sequences.shape[0],1) |
| train_labels = train_labels.reshape(train_labels.shape[0],1) |
| test_labels = test_labels.reshape(test_labels.shape[0],1) |
|
|
| |
| trains = list(np.concatenate((train_sequences, train_labels), axis=1)) |
| tests = list(np.concatenate((test_sequences, test_labels), axis=1)) |
|
|
| random.seed(args.seed) |
| random.shuffle(trains) |
| random.shuffle(trains) |
| random.shuffle(tests) |
| random.shuffle(tests) |
|
|
|
|
| |
| output_path = os.path.join(args.output_path, str(args.kmer), folder) |
| if not os.path.exists(output_path): |
| os.makedirs(output_path) |
|
|
| |
|
|
| |
| f_train = open(os.path.join(output_path, "train.tsv"), 'wt') |
| tsv_train = csv.writer(f_train, delimiter='\t') |
| tsv_train.writerow(["sequence", "label"]) |
| for i in range(len(trains)): |
| sentence = get_kmer_sentence(trains[i][0].decode("utf-8"), args.kmer) |
| tsv_train.writerow([sentence, int(trains[i][1])]) |
|
|
| f_dev = open(os.path.join(output_path, "dev.tsv"), 'wt') |
| tsv_dev = csv.writer(f_dev, delimiter='\t') |
| tsv_dev.writerow(["sequence", "label"]) |
| for i in range(len(tests)): |
| sentence = get_kmer_sentence(tests[i][0].decode("utf-8"), args.kmer) |
| tsv_dev.writerow([sentence, int(tests[i][1])]) |
| |
|
|
| count += 1 |
| print("Finish %s folders" % (count)) |
|
|
|
|
| def Process_mouse(args): |
| random.seed(args.seed) |
|
|
| files = os.listdir(args.file_path) |
|
|
| try: |
| files.remove("3") |
| files.remove("4") |
| files.remove("5") |
| files.remove("6") |
| except ValueError: |
| files = files |
|
|
| files.sort() |
| assert len(files) % 2 == 0 |
|
|
| num_task = int(len(files)/2) |
|
|
| max_length = 0 |
|
|
| for i in range(num_task): |
| index = str(i) if i > 9 else "0" + str(i) |
|
|
| test_name = files[2*i].replace("test", "train") |
| train_name = files[2*i+1] |
| assert test_name == train_name |
|
|
| test_file = os.path.join(args.file_path, files[2*i]) |
| train_file = os.path.join(args.file_path, files[2*i+1]) |
| train_file = open(train_file, "r", encoding="utf-8-sig") |
| test_file = open(test_file, "r", encoding="utf-8-sig") |
| train_lines = list(csv.reader(train_file, delimiter=",", quotechar=None))[1:] |
| test_lines = list(csv.reader(test_file, delimiter=",", quotechar=None))[1:] |
|
|
| print("dataset %d : %d lines" % (i, len(train_lines))) |
|
|
| |
|
|
| |
| |
| |
| |
|
|
|
|
|
|
| def Process(args): |
| if args.output_path != None: |
| output_path = args.output_path |
| else: |
| root_path = "/".join(args.file_path.split("/")[:-1]) + "/" + str(args.kmer) + "/" |
| output_path = root_path + args.file_path.split("/")[-1] |
| if not os.path.exists(root_path): |
| os.makedirs(root_path) |
| |
| old_file = open(args.file_path, "r", encoding="utf-8-sig") |
| lines = list(csv.reader(old_file, delimiter=args.delimiter, quotechar=None)) |
|
|
| write_file(lines, output_path, args.kmer, head=args.head, seq_index=args.seq_index, label_index=args.label_index) |
| |
|
|
| def main(): |
| parser = argparse.ArgumentParser() |
| parser.add_argument( |
| "--kmer", |
| default=1, |
| type=int, |
| help="K-mer", |
| ) |
| parser.add_argument( |
| "--seed", |
| default=24, |
| type=int, |
| help="Which random seed to use", |
| ) |
| parser.add_argument( |
| "--task", |
| default="", |
| type=str, |
| help="which task to do", |
| ) |
| parser.add_argument( |
| "--file_path", |
| default=None, |
| type=str, |
| help="The path of the file to be processed", |
| ) |
| parser.add_argument( |
| "--output_path", |
| default=None, |
| type=str, |
| help="The path of the processed data", |
| ) |
| parser.add_argument( |
| "--delimiter", |
| default=',', |
| type=str, |
| help="The path of the processed data", |
| ) |
| parser.add_argument( |
| "--head", |
| action="store_true", |
| help="The path of the processed data", |
| ) |
| parser.add_argument( |
| "--dev", |
| action="store_true", |
| help="Use this flag to split data as (8:1:1), else (9:1)", |
| ) |
| parser.add_argument( |
| "--seq_index", |
| default=2, |
| type=int, |
| help="index of seq in the original csv file", |
| ) |
| parser.add_argument( |
| "--label_index", |
| default=3, |
| type=int, |
| help="index of label in the original csv file", |
| ) |
| args = parser.parse_args() |
|
|
| if args.task == "generate_prom": |
| Generate_prom_train_dev(args) |
| elif args.task == "shuffle": |
| Shuffle(args) |
| elif args.task == "find_train": |
| Find_train(args) |
| elif args.task == "prom_1000": |
| Process_1000(args) |
| elif args.task == "prom_1000_kmer": |
| Process_1000_kmer(args) |
| elif args.task == "splice": |
| Process_splice(args) |
| elif args.task == "pair": |
| Process_pair(args) |
| elif args.task == "p53": |
| Process_p53(args) |
| elif args.task == "p53_mut": |
| Process_p53_mut(args) |
| elif args.task == "sep_p53": |
| Seperate_p53(args) |
| elif args.task == "690": |
| Process_690(args) |
| elif args.task == "mouse": |
| Process_mouse(args) |
| elif args.task == "prom-core": |
| Process_prom_core(args) |
| else: |
| Process(args) |
| |
| |
|
|
| |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|