Spaces:
Runtime error
Runtime error
import random | |
import biotite | |
import numpy as np | |
import torch.utils.data as data | |
from typing import List | |
from biotite.structure.residues import get_residues | |
from biotite.sequence import ProteinSequence | |
from biotite.structure.io import pdbx, pdb | |
from biotite.structure import filter_backbone | |
from biotite.structure import get_chains | |
def load_structure(fpath, chain=None):
    """
    Load the backbone atoms of the selected chains from a PDB or mmCIF file.

    Args:
        fpath: filepath to either pdb or cif file; the parser is chosen from
            the filename suffix ('cif' or 'pdb'). Path-like objects are
            accepted and converted with str().
        chain: the chain id or list/tuple of chain ids to load; None loads
            every chain present in the file
    Returns:
        biotite.structure.AtomArray holding only the backbone atoms of the
        requested chains, taken from model 1 of the file
    Raises:
        ValueError: if the suffix is neither 'cif' nor 'pdb', if the file
            contains no chains after backbone filtering, or if a requested
            chain is not present
    """
    fpath = str(fpath)
    if fpath.endswith('cif'):
        with open(fpath) as fin:
            pdbxf = pdbx.PDBxFile.read(fin)
        structure = pdbx.get_structure(pdbxf, model=1)
    elif fpath.endswith('pdb'):
        with open(fpath) as fin:
            pdbf = pdb.PDBFile.read(fin)
        structure = pdb.get_structure(pdbf, model=1)
    else:
        # Without this branch `structure` would be unbound below and the
        # caller would see an opaque UnboundLocalError.
        raise ValueError(
            f'Unsupported file type for {fpath!r}: expected a path ending in "pdb" or "cif"'
        )

    # Keep backbone atoms only; everything downstream works on the backbone.
    bbmask = filter_backbone(structure)
    structure = structure[bbmask]

    all_chains = get_chains(structure)
    if len(all_chains) == 0:
        raise ValueError('No chains found in the input file.')

    if chain is None:
        chain_ids = all_chains
    elif isinstance(chain, (list, tuple)):
        chain_ids = list(chain)
    else:
        chain_ids = [chain]

    # Use a separate loop name so the `chain` argument is not clobbered.
    for chain_id in chain_ids:
        if chain_id not in all_chains:
            raise ValueError(f'Chain {chain_id} not found in input file')

    # Vectorized membership test on the per-atom chain annotation.
    chain_filter = np.isin(structure.chain_id, chain_ids)
    return structure[chain_filter]
def get_atom_coords_residuewise(atoms: List[str], struct: biotite.structure.AtomArray):
    """
    Collect, for every residue of `struct`, the coordinates of the named atoms.

    Example for atoms argument: ["N", "CA", "C"]

    For each residue the callback returns one coordinate row per requested
    atom name, in the order given by `atoms`. A name that does not occur in
    the residue yields a row of NaN; a name that occurs more than once
    raises RuntimeError.
    """
    def _pick_atoms(residue, axis=None):
        # Column j marks which atoms of this residue carry the name atoms[j].
        name_hits = np.stack(
            [residue.atom_name == wanted for wanted in atoms], axis=1
        )
        hits_per_name = name_hits.sum(0)
        if np.any(hits_per_name > 1):
            raise RuntimeError("structure has multiple atoms with same name")
        # argmax gives the matching atom's position (0 when there is no
        # match; those rows are overwritten with NaN right below).
        first_hit = name_hits.argmax(0)
        picked = residue[first_hit].coord
        picked[hits_per_name == 0] = np.nan
        return picked

    return biotite.structure.apply_residue_wise(struct, struct, _pick_atoms)
def extract_coords_from_structure(structure: biotite.structure.AtomArray):
    """
    Extract the backbone (N, CA, C) coordinates of every residue.

    Args:
        structure: An instance of biotite AtomArray
    Returns:
        coords: an L x 3 x 3 array for N, CA, C coordinates, where L is the
        number of residues in `structure`

    Note:
        Only the coordinates are returned. The sequence is not part of the
        return value; use extract_seq_from_pdb to obtain it.
    """
    # The sequence used to be computed here and then discarded; that dead
    # work is removed so that a residue name without a one-letter code can
    # no longer make coordinate extraction fail.
    return get_atom_coords_residuewise(["N", "CA", "C"], structure)
def extract_seq_from_pdb(pdb_file, chain=None):
    """
    Read a structure file and return its amino-acid sequence.

    Args:
        pdb_file: filepath passed on to load_structure
        chain: chain selection passed on to load_structure (default None)
    Returns:
        - seq is the extracted sequence, one letter per residue, obtained by
          converting each residue's three-letter name
    """
    atoms = load_structure(pdb_file, chain)
    _, residue_names = get_residues(atoms)
    one_letter_codes = map(ProteinSequence.convert_letter_3to1, residue_names)
    return ''.join(one_letter_codes)