|
import sys |
|
import numpy as np |
|
import random |
|
|
|
|
|
class ContigMap:
    """
    Class for doing mapping between reference residues and output residues.

    Inherited from Inpainting. To update at some point.
    Supports multichain or multiple crops from a single receptor chain.
    Also supports indexing jump (+200) or not, based on contig input.
    Default chain outputs are inpainted chains as A (and B, C etc. if multiple
    chains), and all fragments of the receptor chain on the next one
    (generally B).

    Contig syntax, as parsed by this class:

    * ``self.contigs[0]`` is split on whitespace into contigs.
    * Each contig is split on ``/`` into segments.
    * A segment starting with a letter references input residues, either a
      range (``A3-6``) or a single residue (``A5``).
    * A purely numeric segment (``4`` or ``5-10``) is a stretch of new
      residues; a range is sampled uniformly with ``random.randint``.
    * A contig whose segments are all references and which ends in ``/0`` is
      a receptor contig. ``/0`` is appended automatically to the last contig
      when all of its segments are references.

    Positions without a reference residue are stored as ``("_", "_")``.

    Main attributes set by the constructor:

    * ``ref`` / ``hal`` / ``rf``: per-position reference residue, output
      residue and model index, inpaint part first, receptor part second.
    * ``mask_1d``: list of bools, True where ``ref`` holds a real residue.
    * ``inpaint_seq`` / ``inpaint_str``: boolean masks (see ``__init__``).
    * ``ref_idx0`` / ``hal_idx0`` (and ``*_inpaint`` / ``*_receptor``
      variants): zero-based positions, see ``get_idx0``.
    """

    # Placeholder stored in ``ref`` for positions with no reference residue.
    _BLANK = ("_", "_")
    # Upper bound on the number of random draws tried to satisfy ``length``.
    _MAX_SAMPLING_ATTEMPTS = 100000

    def __init__(
        self,
        parsed_pdb,
        contigs=None,
        inpaint_seq=None,
        inpaint_str=None,
        length=None,
        ref_idx=None,
        hal_idx=None,
        idx_rf=None,
        inpaint_seq_tensor=None,
        inpaint_str_tensor=None,
        topo=False,
        provide_seq=None,
    ):
        """
        Build the mapping either from a contig string or from explicit lists.

        Args:
            parsed_pdb: mapping whose ``"pdb_idx"`` entry is a sequence of
                ``(chain, residue_number)`` tuples; it is only consulted for
                positions that reference a real residue.
            contigs: sequence whose first element is the contig string.
            inpaint_seq: iterable of ``/``-joined reference segments whose
                positions are set to False in ``self.inpaint_seq``.
            inpaint_str: same as ``inpaint_seq`` but for ``self.inpaint_str``.
            length: total sampled length, either ``"N"`` or ``"LOW-HIGH"``
                (inclusive); an ``int`` is accepted as well.
            ref_idx, hal_idx, idx_rf: explicit mapping; all three must be
                given together and take precedence over ``contigs``.
            inpaint_seq_tensor, inpaint_str_tensor: ready-made masks that are
                stored as-is instead of being derived.
            topo: when True every contig is treated as a receptor contig.
            provide_seq: iterable of zero-based positions (``"i"``) or
                inclusive ranges (``"i-j"``) forced to True in
                ``self.inpaint_seq``.

        Exits via ``sys.exit`` when neither ``contigs`` nor a mapping is
        given, or when the explicit mapping is incomplete.
        """
        if contigs is None and ref_idx is None:
            sys.exit("Must either specify a contig string or precise mapping")
        explicit = [arg is not None for arg in (idx_rf, hal_idx, ref_idx)]
        if any(explicit) and not all(explicit):
            sys.exit(
                "If you're specifying specific contig mappings, the reference and output positions must be specified, AND the indexing for RoseTTAFold (idx_rf)"
            )

        self.chain_order = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

        if length is None:
            self.length = None
        else:
            # Stored as a half-open [low, high + 1) interval.
            bounds = str(length).split("-")
            low = int(bounds[0])
            high = int(bounds[1]) if len(bounds) > 1 else low
            self.length = [low, high + 1]

        self.ref_idx = ref_idx
        self.hal_idx = hal_idx
        self.idx_rf = idx_rf
        # Flatten e.g. ["A1-3/A7", "B2"] into ["A1-3", "A7", "B2"].
        self.inpaint_seq = (
            "/".join(inpaint_seq).split("/") if inpaint_seq is not None else None
        )
        self.inpaint_str = (
            "/".join(inpaint_str).split("/") if inpaint_str is not None else None
        )
        self.inpaint_seq_tensor = inpaint_seq_tensor
        self.inpaint_str_tensor = inpaint_str_tensor
        self.parsed_pdb = parsed_pdb
        self.topo = topo
        self.contigs = contigs

        if ref_idx is None:
            # Contig mode: sample lengths, then expand to per-residue lists.
            (
                self.sampled_mask,
                self.contig_length,
                self.n_inpaint_chains,
            ) = self.get_sampled_mask()
            # The receptor goes on the first chain letter not used by an
            # inpainted chain.
            self.receptor_chain = self.chain_order[self.n_inpaint_chains]
            (
                self.receptor,
                self.receptor_hal,
                self.receptor_rf,
                self.inpaint,
                self.inpaint_hal,
                self.inpaint_rf,
            ) = self.expand_sampled_mask()
            self.ref = self.inpaint + self.receptor
            self.hal = self.inpaint_hal + self.receptor_hal
            self.rf = self.inpaint_rf + self.receptor_rf
        else:
            # Precise-mapping mode: the caller supplies the lists directly.
            self.ref = ref_idx
            self.hal = hal_idx
            self.rf = idx_rf
            # No contig was sampled. The whole mapping is exposed as the
            # inpaint part with an empty receptor so that get_idx0() and
            # get_mappings() have the attributes they read.
            # NOTE(review): the inpaint/receptor split cannot be recovered
            # from the explicit lists - confirm this default suits callers.
            self.sampled_mask = None
            self.inpaint = self.ref
            self.inpaint_hal = self.hal
            self.inpaint_rf = self.rf
            self.receptor = []
            self.receptor_hal = []
            self.receptor_rf = []

        self.mask_1d = [res != self._BLANK for res in self.ref]

        # Sequence / structure masks: an explicit tensor wins, then the
        # segment list, then the plain "has a reference residue" mask.
        self.inpaint_seq = self._resolve_mask(
            self.inpaint_seq_tensor, self.inpaint_seq
        )
        self.inpaint_str = self._resolve_mask(
            self.inpaint_str_tensor, self.inpaint_str
        )

        (
            self.ref_idx0,
            self.hal_idx0,
            self.ref_idx0_inpaint,
            self.hal_idx0_inpaint,
            self.ref_idx0_receptor,
            self.hal_idx0_receptor,
        ) = self.get_idx0()
        self.con_ref_pdb_idx = [res for res in self.ref if res != self._BLANK]

        if provide_seq is not None:
            # Positions index directly into inpaint_seq (zero-based).
            for spec in provide_seq:
                if "-" in spec:
                    first, last = spec.split("-")[:2]
                    self.inpaint_seq[int(first) : int(last) + 1] = True
                else:
                    self.inpaint_seq[int(spec)] = True

    @staticmethod
    def _parse_ref_segment(segment):
        """Split ``"A3-6"`` or ``"A5"`` into ``(chain, start, end)``, end inclusive."""
        pieces = segment.split("-")
        start = int(pieces[0][1:])
        end = int(pieces[1]) if len(pieces) > 1 else start
        return segment[0], start, end

    def _is_receptor_contig(self, con):
        """Return True when ``con`` is a receptor contig (or ``topo`` is set)."""
        parts = con.split("/")
        return (
            all([part[0].isalpha() for part in parts[:-1]]) and parts[-1] == "0"
        ) or self.topo is True

    def _resolve_mask(self, tensor, segments):
        """Pick the mask source: explicit tensor, segment list, or ``mask_1d``."""
        if tensor is not None:
            return tensor
        if segments is not None:
            return self.get_inpaint_seq_str(segments)
        return np.array(self.mask_1d)

    def _index_pairs(self, residues):
        """
        Return ``(ref_positions, hal_positions)`` for the real residues.

        ``hal_positions`` are positions within ``residues``; ``ref_positions``
        are the matching positions within ``parsed_pdb["pdb_idx"]``.
        """
        ref_positions = []
        hal_positions = []
        for pos, res in enumerate(residues):
            if res != self._BLANK:
                try:
                    ref_pos = self.parsed_pdb["pdb_idx"].index(res)
                except ValueError:
                    # Explicit raise so the check survives ``python -O``.
                    raise AssertionError(f"{res} is not in pdb file!") from None
                hal_positions.append(pos)
                ref_positions.append(ref_pos)
        return ref_positions, hal_positions

    def get_sampled_mask(self):
        """
        Function to get a sampled mask from a contig.

        Numeric ranges are sampled until the summed length of the inpainted
        contigs falls inside ``self.length`` (when one is set); receptor
        contigs do not count towards that length.

        Returns:
            ``(sampled_mask, sampled_length, n_inpaint_chains)`` where
            ``sampled_mask`` is the list of contigs with every numeric segment
            rewritten as ``"N-N"``.

        Exits via ``sys.exit`` when no compatible sample is found within
        ``_MAX_SAMPLING_ATTEMPTS`` draws.
        """
        contig_list = self.contigs[0].strip().split()
        # A trailing contig made only of reference segments is a receptor.
        if all([part[0].isalpha() for part in contig_list[-1].split("/")]):
            contig_list[-1] = f"{contig_list[-1]}/0"

        for _ in range(self._MAX_SAMPLING_ATTEMPTS):
            sampled_mask = []
            sampled_length = 0
            inpaint_chains = 0
            for con in contig_list:
                if self._is_receptor_contig(con):
                    sampled_mask.append(con)
                    continue

                inpaint_chains += 1
                sampled_segments = []
                for subcon in con.split("/"):
                    if subcon[0].isalpha():
                        _, start, end = self._parse_ref_segment(subcon)
                        sampled_segments.append(subcon)
                        sampled_length += end - start + 1
                    elif "-" in subcon:
                        low, high = subcon.split("-")[:2]
                        drawn = random.randint(int(low), int(high))
                        sampled_segments.append(f"{drawn}-{drawn}")
                        sampled_length += drawn
                    elif subcon == "0":
                        # Chain-break marker, contributes no residues.
                        sampled_segments.append("0")
                    else:
                        fixed = int(subcon)
                        sampled_segments.append(f"{fixed}-{fixed}")
                        sampled_length += fixed
                sampled_mask.append("/".join(sampled_segments))

            if (
                self.length is None
                or self.length[0] <= sampled_length < self.length[1]
            ):
                return sampled_mask, sampled_length, inpaint_chains

        sys.exit("Contig string incompatible with --length range")

    def expand_sampled_mask(self):
        """
        Expand ``self.sampled_mask`` into per-residue lists.

        Returns:
            ``(receptor, receptor_hal, receptor_rf, inpaint, inpaint_hal,
            inpaint_rf)``. The first and fourth items hold reference residues
            (or ``("_", "_")``), the ``*_hal`` items hold output
            ``(chain, number)`` tuples and the ``*_rf`` items are lists of
            integer indices with gaps inserted at chain breaks.

        Raises:
            AssertionError: when the fragments of one receptor contig come
                from different chains or are not in ascending order.
        """
        receptor = []
        inpaint = []
        receptor_hal = []
        inpaint_hal = []
        receptor_idx = 1
        inpaint_idx = 1
        inpaint_chain_idx = -1
        # Each entry is (position of the next block, index gap to insert).
        receptor_chain_break = []
        inpaint_chain_break = []

        for con in self.sampled_mask:
            if self._is_receptor_contig(con):
                subcons = con.split("/")[:-1]
                if not all([s[0] == subcons[0][0] for s in subcons]):
                    raise AssertionError(
                        "If specifying fragmented receptor in a single block of the contig string, they MUST derive from the same chain"
                    )
                fragments = [self._parse_ref_segment(s) for s in subcons]
                if not all(
                    fragments[i][1] < fragments[i + 1][1]
                    for i in range(len(fragments) - 1)
                ):
                    raise AssertionError(
                        "If specifying multiple fragments from the same chain, pdb indices must be in ascending order!"
                    )
                for idx, (chain, start, end) in enumerate(fragments):
                    ref_to_add = [(chain, i) for i in np.arange(start, end + 1)]
                    receptor.extend(ref_to_add)
                    receptor_hal.extend(
                        (self.receptor_chain, i)
                        for i in np.arange(
                            receptor_idx, receptor_idx + len(ref_to_add)
                        )
                    )
                    receptor_idx += len(ref_to_add)
                    if idx != len(fragments) - 1:
                        # Between fragments, keep the gap of the input.
                        idx_jump = fragments[idx + 1][1] - end - 1
                        receptor_chain_break.append((receptor_idx - 1, idx_jump))
                    else:
                        receptor_chain_break.append((receptor_idx - 1, 200))
            else:
                inpaint_chain_idx += 1
                for subcon in con.split("/"):
                    if subcon[0].isalpha():
                        chain, start, end = self._parse_ref_segment(subcon)
                        ref_to_add = [
                            (chain, i) for i in np.arange(start, end + 1)
                        ]
                    else:
                        # Sampled segments look like "N-N"; "0" adds nothing.
                        ref_to_add = [self._BLANK] * int(subcon.split("-")[0])
                    inpaint.extend(ref_to_add)
                    inpaint_hal.extend(
                        (self.chain_order[inpaint_chain_idx], i)
                        for i in np.arange(
                            inpaint_idx, inpaint_idx + len(ref_to_add)
                        )
                    )
                    inpaint_idx += len(ref_to_add)
                inpaint_chain_break.append((inpaint_idx - 1, 200))

        if self.topo is not True and inpaint_hal:
            # Receptor numbering continues after the last inpainted residue.
            offset = inpaint_hal[-1][1]
            receptor_hal = [(chain, num + offset) for chain, num in receptor_hal]

        inpaint_rf = np.arange(0, len(inpaint))
        receptor_rf = np.arange(
            len(inpaint) + 200, len(inpaint) + len(receptor) + 200
        )
        # The last break of each list is the end of that part, not a gap.
        for position, jump in inpaint_chain_break[:-1]:
            receptor_rf[:] += 200
            inpaint_rf[position:] += jump
        for position, jump in receptor_chain_break[:-1]:
            receptor_rf[position:] += jump

        return (
            receptor,
            receptor_hal,
            receptor_rf.tolist(),
            inpaint,
            inpaint_hal,
            inpaint_rf.tolist(),
        )

    def get_inpaint_seq_str(self, inpaint_s):
        """
        Function to generate inpaint_str or inpaint_seq masks specific to this contig.

        Args:
            inpaint_s: iterable of reference segments (``"A3-6"`` or ``"A5"``).

        Returns:
            A copy of ``self.mask_1d`` as an array in which every listed
            residue that occurs in ``self.ref`` is set to False (first
            occurrence only). Residues absent from ``self.ref`` are ignored.
        """
        s_mask = np.copy(self.mask_1d)
        for segment in inpaint_s:
            chain, start, end = self._parse_ref_segment(segment)
            for resnum in range(start, end + 1):
                try:
                    position = self.ref.index((chain, resnum))
                except ValueError:
                    continue
                s_mask[position] = False
        return s_mask

    def get_idx0(self):
        """
        Compute zero-based positions of the real residues.

        Returns:
            ``(ref_idx0, hal_idx0, ref_idx0_inpaint, hal_idx0_inpaint,
            ref_idx0_receptor, hal_idx0_receptor)``. Each ``hal_*`` list holds
            positions within ``self.ref`` / ``self.inpaint`` /
            ``self.receptor``; each ``ref_*`` list holds the matching
            positions within ``parsed_pdb["pdb_idx"]``.

        Raises:
            AssertionError: when a referenced residue is not in the pdb index.
        """
        ref_idx0, hal_idx0 = self._index_pairs(self.ref)
        ref_idx0_inpaint, hal_idx0_inpaint = self._index_pairs(self.inpaint)
        ref_idx0_receptor, hal_idx0_receptor = self._index_pairs(self.receptor)
        return (
            ref_idx0,
            hal_idx0,
            ref_idx0_inpaint,
            hal_idx0_inpaint,
            ref_idx0_receptor,
            hal_idx0_receptor,
        )

    def get_mappings(self):
        """
        Collect the mapping into a plain dictionary.

        The ``complex_*`` and ``receptor_*`` entries are only present when
        ``self.inpaint`` differs from ``self.ref``, i.e. when there is a
        receptor part.
        """
        blank = self._BLANK
        mappings = {}
        mappings["con_ref_pdb_idx"] = [res for res in self.inpaint if res != blank]
        mappings["con_hal_pdb_idx"] = [
            hal for hal, ref in zip(self.inpaint_hal, self.inpaint) if ref != blank
        ]
        mappings["con_ref_idx0"] = np.array(self.ref_idx0_inpaint)
        mappings["con_hal_idx0"] = np.array(self.hal_idx0_inpaint)
        if self.inpaint != self.ref:
            mappings["complex_con_ref_pdb_idx"] = [
                res for res in self.ref if res != blank
            ]
            mappings["complex_con_hal_pdb_idx"] = [
                hal for hal, ref in zip(self.hal, self.ref) if ref != blank
            ]
            mappings["receptor_con_ref_pdb_idx"] = [
                res for res in self.receptor if res != blank
            ]
            mappings["receptor_con_hal_pdb_idx"] = [
                hal
                for hal, ref in zip(self.receptor_hal, self.receptor)
                if ref != blank
            ]
            mappings["complex_con_ref_idx0"] = np.array(self.ref_idx0)
            mappings["complex_con_hal_idx0"] = np.array(self.hal_idx0)
            mappings["receptor_con_ref_idx0"] = np.array(self.ref_idx0_receptor)
            mappings["receptor_con_hal_idx0"] = np.array(self.hal_idx0_receptor)
        mappings["inpaint_str"] = self.inpaint_str
        mappings["inpaint_seq"] = self.inpaint_seq
        mappings["sampled_mask"] = self.sampled_mask
        mappings["mask_1d"] = self.mask_1d
        return mappings
|
|