predict-tfbs / utils.py
moslem's picture
Update utils.py
dd2df80 verified
import numpy as np
def dnaseq_features(seq, *, seg_len=101, seq_name="seq"):
    """Split a DNA sequence into fixed-length windows and one-hot encode them.

    The input is stripped of leading/trailing whitespace and upper-cased,
    then cut into consecutive, non-overlapping windows of ``seg_len`` bases.
    Trailing bases that do not fill a complete window are discarded.

    Each window is encoded with one row per position and one column per
    base in the order ``A, C, G, T``. Characters outside that alphabet
    (e.g. ``N``) are encoded as an all-zero row so that every window keeps
    exactly ``seg_len`` rows and the windows can always be stacked.

    Args:
        seq: The DNA sequence as a string.
        seg_len: Number of bases per window (default 101).
        seq_name: Prefix used when building window identifiers.

    Returns:
        A tuple ``(matrix, index, values)`` where ``matrix`` is an integer
        array of shape ``(n_windows, seg_len, 4)``, ``index`` is the list of
        identifiers formatted as ``"<seq_name>_<start>:<end>"``, and
        ``values`` is the list of window strings in the same order.

    Raises:
        ValueError: If ``seg_len`` is smaller than 1, or if the cleaned
            sequence is shorter than ``seg_len``.
    """
    if seg_len < 1:
        raise ValueError(f"seg_len must be a positive integer, got {seg_len}.")

    seq = seq.strip().upper()

    # A sequence shorter than one window cannot produce any segment.
    if len(seq) < seg_len:
        raise ValueError(
            f"Sequence too short ({len(seq)} bp). "
            f"Must be at least {seg_len} bases long."
        )

    # Floor division drops the incomplete trailing window; when the length
    # is an exact multiple of seg_len the whole sequence is used.
    n_windows = len(seq) // seg_len

    index = []
    values = []
    for i in range(n_windows):
        a = i * seg_len
        b = a + seg_len
        index.append(f"{seq_name}_{a}:{b}")
        values.append(seq[a:b])

    # One-hot encode into a preallocated array. Unknown characters are left
    # as all-zero rows instead of being dropped, which would otherwise
    # shorten the window and break stacking.
    alphabet = "ACGT"
    char_to_int = {c: i for i, c in enumerate(alphabet)}
    matrix = np.zeros((n_windows, seg_len, len(alphabet)), dtype=int)
    for row, segment in enumerate(values):
        for pos, char in enumerate(segment):
            col = char_to_int.get(char)
            if col is not None:
                matrix[row, pos, col] = 1

    return matrix, index, values