Spaces:
Starting
on
T4
Starting
on
T4
# Copyright 2021 DeepMind Technologies Limited | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
"""Parses the mmCIF file format.""" | |
import collections | |
import dataclasses | |
import io | |
from typing import Any, Mapping, Optional, Sequence, Tuple | |
from absl import logging | |
from Bio import PDB | |
from Bio.Data import SCOPData | |
# Type aliases: | |
ChainId = str | |
PdbHeader = Mapping[str, Any] | |
PdbStructure = PDB.Structure.Structure | |
SeqRes = str | |
MmCIFDict = Mapping[str, Sequence[str]] | |
class Monomer: | |
id: str | |
num: int | |
# Note - mmCIF format provides no guarantees on the type of author-assigned | |
# sequence numbers. They need not be integers. | |
class AtomSite: | |
residue_name: str | |
author_chain_id: str | |
mmcif_chain_id: str | |
author_seq_num: str | |
mmcif_seq_num: int | |
insertion_code: str | |
hetatm_atom: str | |
model_num: int | |
# Used to map SEQRES index to a residue in the structure. | |
class ResiduePosition: | |
chain_id: str | |
residue_number: int | |
insertion_code: str | |
class ResidueAtPosition: | |
position: Optional[ResiduePosition] | |
name: str | |
is_missing: bool | |
hetflag: str | |
class MmcifObject: | |
"""Representation of a parsed mmCIF file. | |
Contains: | |
file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all | |
files being processed. | |
header: Biopython header. | |
structure: Biopython structure. | |
chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g. | |
{'A': 'ABCDEFG'} | |
seqres_to_structure: Dict; for each chain_id contains a mapping between | |
SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition, | |
1: ResidueAtPosition, | |
...}} | |
raw_string: The raw string used to construct the MmcifObject. | |
""" | |
file_id: str | |
header: PdbHeader | |
structure: PdbStructure | |
chain_to_seqres: Mapping[ChainId, SeqRes] | |
seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]] | |
raw_string: Any | |
class ParsingResult: | |
"""Returned by the parse function. | |
Contains: | |
mmcif_object: A MmcifObject, may be None if no chain could be successfully | |
parsed. | |
errors: A dict mapping (file_id, chain_id) to any exception generated. | |
""" | |
mmcif_object: Optional[MmcifObject] | |
errors: Mapping[Tuple[str, str], Any] | |
class ParseError(Exception): | |
"""An error indicating that an mmCIF file could not be parsed.""" | |
def mmcif_loop_to_list(prefix: str, | |
parsed_info: MmCIFDict) -> Sequence[Mapping[str, str]]: | |
"""Extracts loop associated with a prefix from mmCIF data as a list. | |
Reference for loop_ in mmCIF: | |
http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html | |
Args: | |
prefix: Prefix shared by each of the data items in the loop. | |
e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num, | |
_entity_poly_seq.mon_id. Should include the trailing period. | |
parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython | |
parser. | |
Returns: | |
Returns a list of dicts; each dict represents 1 entry from an mmCIF loop. | |
""" | |
cols = [] | |
data = [] | |
for key, value in parsed_info.items(): | |
if key.startswith(prefix): | |
cols.append(key) | |
data.append(value) | |
assert all([len(xs) == len(data[0]) for xs in data]), ( | |
'mmCIF error: Not all loops are the same length: %s' % cols) | |
return [dict(zip(cols, xs)) for xs in zip(*data)] | |
def mmcif_loop_to_dict(prefix: str, | |
index: str, | |
parsed_info: MmCIFDict, | |
) -> Mapping[str, Mapping[str, str]]: | |
"""Extracts loop associated with a prefix from mmCIF data as a dictionary. | |
Args: | |
prefix: Prefix shared by each of the data items in the loop. | |
e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num, | |
_entity_poly_seq.mon_id. Should include the trailing period. | |
index: Which item of loop data should serve as the key. | |
parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython | |
parser. | |
Returns: | |
Returns a dict of dicts; each dict represents 1 entry from an mmCIF loop, | |
indexed by the index column. | |
""" | |
entries = mmcif_loop_to_list(prefix, parsed_info) | |
return {entry[index]: entry for entry in entries} | |
def parse(*, | |
file_id: str, | |
mmcif_string: str, | |
catch_all_errors: bool = True) -> ParsingResult: | |
"""Entry point, parses an mmcif_string. | |
Args: | |
file_id: A string identifier for this file. Should be unique within the | |
collection of files being processed. | |
mmcif_string: Contents of an mmCIF file. | |
catch_all_errors: If True, all exceptions are caught and error messages are | |
returned as part of the ParsingResult. If False exceptions will be allowed | |
to propagate. | |
Returns: | |
A ParsingResult. | |
""" | |
errors = {} | |
try: | |
parser = PDB.MMCIFParser(QUIET=True) | |
handle = io.StringIO(mmcif_string) | |
full_structure = parser.get_structure('', handle) | |
first_model_structure = _get_first_model(full_structure) | |
# Extract the _mmcif_dict from the parser, which contains useful fields not | |
# reflected in the Biopython structure. | |
parsed_info = parser._mmcif_dict # pylint:disable=protected-access | |
# Ensure all values are lists, even if singletons. | |
for key, value in parsed_info.items(): | |
if not isinstance(value, list): | |
parsed_info[key] = [value] | |
header = _get_header(parsed_info) | |
# Determine the protein chains, and their start numbers according to the | |
# internal mmCIF numbering scheme (likely but not guaranteed to be 1). | |
valid_chains = _get_protein_chains(parsed_info=parsed_info) | |
if not valid_chains: | |
return ParsingResult( | |
None, {(file_id, ''): 'No protein chains found in this file.'}) | |
seq_start_num = {chain_id: min([monomer.num for monomer in seq]) | |
for chain_id, seq in valid_chains.items()} | |
# Loop over the atoms for which we have coordinates. Populate two mappings: | |
# -mmcif_to_author_chain_id (maps internal mmCIF chain ids to chain ids used | |
# the authors / Biopython). | |
# -seq_to_structure_mappings (maps idx into sequence to ResidueAtPosition). | |
mmcif_to_author_chain_id = {} | |
seq_to_structure_mappings = {} | |
for atom in _get_atom_site_list(parsed_info): | |
if atom.model_num != '1': | |
# We only process the first model at the moment. | |
continue | |
mmcif_to_author_chain_id[atom.mmcif_chain_id] = atom.author_chain_id | |
if atom.mmcif_chain_id in valid_chains: | |
hetflag = ' ' | |
if atom.hetatm_atom == 'HETATM': | |
# Water atoms are assigned a special hetflag of W in Biopython. We | |
# need to do the same, so that this hetflag can be used to fetch | |
# a residue from the Biopython structure by id. | |
if atom.residue_name in ('HOH', 'WAT'): | |
hetflag = 'W' | |
else: | |
hetflag = 'H_' + atom.residue_name | |
insertion_code = atom.insertion_code | |
if not _is_set(atom.insertion_code): | |
insertion_code = ' ' | |
position = ResiduePosition(chain_id=atom.author_chain_id, | |
residue_number=int(atom.author_seq_num), | |
insertion_code=insertion_code) | |
seq_idx = int(atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id] | |
current = seq_to_structure_mappings.get(atom.author_chain_id, {}) | |
current[seq_idx] = ResidueAtPosition(position=position, | |
name=atom.residue_name, | |
is_missing=False, | |
hetflag=hetflag) | |
seq_to_structure_mappings[atom.author_chain_id] = current | |
# Add missing residue information to seq_to_structure_mappings. | |
for chain_id, seq_info in valid_chains.items(): | |
author_chain = mmcif_to_author_chain_id[chain_id] | |
current_mapping = seq_to_structure_mappings[author_chain] | |
for idx, monomer in enumerate(seq_info): | |
if idx not in current_mapping: | |
current_mapping[idx] = ResidueAtPosition(position=None, | |
name=monomer.id, | |
is_missing=True, | |
hetflag=' ') | |
author_chain_to_sequence = {} | |
for chain_id, seq_info in valid_chains.items(): | |
author_chain = mmcif_to_author_chain_id[chain_id] | |
seq = [] | |
for monomer in seq_info: | |
code = SCOPData.protein_letters_3to1.get(monomer.id, 'X') | |
seq.append(code if len(code) == 1 else 'X') | |
seq = ''.join(seq) | |
author_chain_to_sequence[author_chain] = seq | |
mmcif_object = MmcifObject( | |
file_id=file_id, | |
header=header, | |
structure=first_model_structure, | |
chain_to_seqres=author_chain_to_sequence, | |
seqres_to_structure=seq_to_structure_mappings, | |
raw_string=parsed_info) | |
return ParsingResult(mmcif_object=mmcif_object, errors=errors) | |
except Exception as e: # pylint:disable=broad-except | |
errors[(file_id, '')] = e | |
if not catch_all_errors: | |
raise | |
return ParsingResult(mmcif_object=None, errors=errors) | |
def _get_first_model(structure: PdbStructure) -> PdbStructure: | |
"""Returns the first model in a Biopython structure.""" | |
return next(structure.get_models()) | |
_MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDE = 21 | |
def get_release_date(parsed_info: MmCIFDict) -> str: | |
"""Returns the oldest revision date.""" | |
revision_dates = parsed_info['_pdbx_audit_revision_history.revision_date'] | |
return min(revision_dates) | |
def _get_header(parsed_info: MmCIFDict) -> PdbHeader: | |
"""Returns a basic header containing method, release date and resolution.""" | |
header = {} | |
experiments = mmcif_loop_to_list('_exptl.', parsed_info) | |
header['structure_method'] = ','.join([ | |
experiment['_exptl.method'].lower() for experiment in experiments]) | |
# Note: The release_date here corresponds to the oldest revision. We prefer to | |
# use this for dataset filtering over the deposition_date. | |
if '_pdbx_audit_revision_history.revision_date' in parsed_info: | |
header['release_date'] = get_release_date(parsed_info) | |
else: | |
logging.warning('Could not determine release_date: %s', | |
parsed_info['_entry.id']) | |
header['resolution'] = 0.00 | |
for res_key in ('_refine.ls_d_res_high', '_em_3d_reconstruction.resolution', | |
'_reflns.d_resolution_high'): | |
if res_key in parsed_info: | |
try: | |
raw_resolution = parsed_info[res_key][0] | |
header['resolution'] = float(raw_resolution) | |
except ValueError: | |
logging.warning('Invalid resolution format: %s', parsed_info[res_key]) | |
return header | |
def _get_atom_site_list(parsed_info: MmCIFDict) -> Sequence[AtomSite]: | |
"""Returns list of atom sites; contains data not present in the structure.""" | |
return [AtomSite(*site) for site in zip( # pylint:disable=g-complex-comprehension | |
parsed_info['_atom_site.label_comp_id'], | |
parsed_info['_atom_site.auth_asym_id'], | |
parsed_info['_atom_site.label_asym_id'], | |
parsed_info['_atom_site.auth_seq_id'], | |
parsed_info['_atom_site.label_seq_id'], | |
parsed_info['_atom_site.pdbx_PDB_ins_code'], | |
parsed_info['_atom_site.group_PDB'], | |
parsed_info['_atom_site.pdbx_PDB_model_num'], | |
)] | |
def _get_protein_chains( | |
*, parsed_info: Mapping[str, Any]) -> Mapping[ChainId, Sequence[Monomer]]: | |
"""Extracts polymer information for protein chains only. | |
Args: | |
parsed_info: _mmcif_dict produced by the Biopython parser. | |
Returns: | |
A dict mapping mmcif chain id to a list of Monomers. | |
""" | |
# Get polymer information for each entity in the structure. | |
entity_poly_seqs = mmcif_loop_to_list('_entity_poly_seq.', parsed_info) | |
polymers = collections.defaultdict(list) | |
for entity_poly_seq in entity_poly_seqs: | |
polymers[entity_poly_seq['_entity_poly_seq.entity_id']].append( | |
Monomer(id=entity_poly_seq['_entity_poly_seq.mon_id'], | |
num=int(entity_poly_seq['_entity_poly_seq.num']))) | |
# Get chemical compositions. Will allow us to identify which of these polymers | |
# are proteins. | |
chem_comps = mmcif_loop_to_dict('_chem_comp.', '_chem_comp.id', parsed_info) | |
# Get chains information for each entity. Necessary so that we can return a | |
# dict keyed on chain id rather than entity. | |
struct_asyms = mmcif_loop_to_list('_struct_asym.', parsed_info) | |
entity_to_mmcif_chains = collections.defaultdict(list) | |
for struct_asym in struct_asyms: | |
chain_id = struct_asym['_struct_asym.id'] | |
entity_id = struct_asym['_struct_asym.entity_id'] | |
entity_to_mmcif_chains[entity_id].append(chain_id) | |
# Identify and return the valid protein chains. | |
valid_chains = {} | |
for entity_id, seq_info in polymers.items(): | |
chain_ids = entity_to_mmcif_chains[entity_id] | |
# Reject polymers without any peptide-like components, such as DNA/RNA. | |
if any(['peptide' in chem_comps[monomer.id]['_chem_comp.type'] | |
for monomer in seq_info]): | |
for chain_id in chain_ids: | |
valid_chains[chain_id] = seq_info | |
return valid_chains | |
def _is_set(data: str) -> bool: | |
"""Returns False if data is a special mmCIF character indicating 'unset'.""" | |
return data not in ('.', '?') | |