BLEACH366
/

P2DFlow

Model card Files Files and versions

P2DFlow / data /parsers.py

Holmes

test

ca7299e 7 months ago

history blame contribute delete

2.97 kB

	# Copyright 2021 AlQuraishi Laboratory
	# Copyright 2021 DeepMind Technologies Limited
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""
	Library for parsing different data structures.
	Code adapted from Openfold protein.py.
	"""
	from Bio.PDB.Chain import Chain
	import numpy as np

	from data import residue_constants
	from data import protein

	# Short module-level alias for the Protein class defined in data.protein;
	# process_chain below uses it as its return annotation and constructor.
	Protein = protein.Protein


	def process_chain(chain: Chain, chain_id: str) -> Protein:
	    """Convert a Biopython chain object into an AlphaFold-style Protein instance.

	    Forked from alphafold.common.protein.from_pdb_string

	    WARNING: All non-standard residue types will be converted into UNK. All
	    non-standard atoms will be ignored.

	    Took out lines 94-97 which don't allow insertions in the PDB.
	    Sabdab uses insertions for the chothia numbering so we need to allow them.

	    Took out lines 110-112 since that would mess up CDR numbering.

	    Args:
	        chain: Instance of Biopython's chain class.
	        chain_id: Identifier recorded once per residue in the returned
	            ``chain_index`` array. It is stored exactly as given, with no
	            conversion applied here.

	    Returns:
	        Protein object with protein features. The per-atom arrays
	        (``atom_positions``, ``atom_mask``, ``b_factors``) always carry the
	        atom-type axis, including when ``chain`` contains no residues.
	    """
	    # Loop-invariant lookups, hoisted out of the per-residue / per-atom loops.
	    num_atom_types = residue_constants.atom_type_num
	    known_atom_names = residue_constants.atom_types
	    atom_slot_of = residue_constants.atom_order
	    restype_3to1 = residue_constants.restype_3to1
	    restype_order = residue_constants.restype_order
	    unknown_restype_idx = residue_constants.restype_num

	    atom_positions = []
	    aatype = []
	    atom_mask = []
	    residue_index = []
	    b_factors = []
	    chain_ids = []
	    for res in chain:
	        # for residue type "X", the chain may need to be removed
	        res_shortname = restype_3to1.get(res.resname, 'X')
	        restype_idx = restype_order.get(res_shortname, unknown_restype_idx)

	        pos = np.zeros((num_atom_types, 3))
	        mask = np.zeros((num_atom_types,))
	        res_b_factors = np.zeros((num_atom_types,))
	        for atom in res:
	            atom_name = atom.name
	            if atom_name not in known_atom_names:
	                # Atoms outside the known atom-type table are skipped.
	                continue
	            slot = atom_slot_of[atom_name]
	            pos[slot] = atom.coord
	            mask[slot] = 1.
	            res_b_factors[slot] = atom.bfactor

	        aatype.append(restype_idx)
	        atom_positions.append(pos)
	        atom_mask.append(mask)
	        # res.id[1] is used as-is, so residues that differ only by insertion
	        # code (see docstring) end up with the same residue_index value.
	        residue_index.append(res.id[1])
	        b_factors.append(res_b_factors)
	        chain_ids.append(chain_id)

	    # The reshape leaves non-empty results unchanged; for an empty chain it
	    # turns the shape-(0,) array from np.array([]) into (0, num_atom_types[, 3]).
	    return Protein(
	        atom_positions=np.array(atom_positions).reshape(-1, num_atom_types, 3),
	        atom_mask=np.array(atom_mask).reshape(-1, num_atom_types),
	        aatype=np.array(aatype),
	        residue_index=np.array(residue_index),
	        chain_index=np.array(chain_ids),
	        b_factors=np.array(b_factors).reshape(-1, num_atom_types))