"""Functions for parsing various file formats.""" |
|
|
import collections |
|
|
import dataclasses |
|
|
import re |
|
|
import string |
|
|
from typing import Dict, Iterable, List, Optional, Sequence, Tuple |
|
|
|
|
|
DeletionMatrix = Sequence[Sequence[int]] |
|
|
|
|
|
|
|
|
@dataclasses.dataclass(frozen=True) |
|
|
class TemplateHit: |
|
|
"""Class representing a template hit.""" |
|
|
index: int |
|
|
name: str |
|
|
aligned_cols: int |
|
|
sum_probs: float |
|
|
query: str |
|
|
hit_sequence: str |
|
|
indices_query: List[int] |
|
|
indices_hit: List[int] |
|
|
|
|
|
|
|
|
def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]: |
|
|
"""Parses FASTA string and returns list of strings with amino-acid sequences. |
|
|
|
|
|
Arguments: |
|
|
fasta_string: The string contents of a FASTA file. |
|
|
|
|
|
Returns: |
|
|
A tuple of two lists: |
|
|
* A list of sequences. |
|
|
* A list of sequence descriptions taken from the comment lines. In the |
|
|
same order as the sequences. |
|
|
""" |
|
|
sequences = [] |
|
|
descriptions = [] |
|
|
index = -1 |
|
|
for line in fasta_string.splitlines(): |
|
|
line = line.strip() |
|
|
if line.startswith('>'): |
|
|
index += 1 |
|
|
descriptions.append(line[1:]) |
|
|
sequences.append('') |
|
|
continue |
|
|
elif not line: |
|
|
continue |
|
|
sequences[index] += line |
|
|
|
|
|
return sequences, descriptions |
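
# Illustrative usage of parse_fasta (the FASTA content below is hypothetical,
# not from any real file):
#   seqs, descs = parse_fasta('>seq1 first\nMKV\nLLT\n>seq2 second\nGGA\n')
#   # seqs == ['MKVLLT', 'GGA'], descs == ['seq1 first', 'seq2 second']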


def parse_stockholm(
    stockholm_string: str
) -> Tuple[Sequence[str], DeletionMatrix, Sequence[str]]:
  """Parses sequences and deletion matrix from stockholm format alignment.

  Args:
    stockholm_string: The string contents of a stockholm file. The first
      sequence in the file should be the query sequence.

  Returns:
    A tuple of:
    * A list of sequences that have been aligned to the query. These
      might contain duplicates.
    * The deletion matrix for the alignment as a list of lists. The element
      at `deletion_matrix[i][j]` is the number of residues deleted from
      the aligned sequence i at residue position j.
    * The names of the targets matched, including the jackhmmer subsequence
      suffix.
  """
  name_to_sequence = collections.OrderedDict()
  for line in stockholm_string.splitlines():
    line = line.strip()
    # Skip blank lines, markup ('#...') and the end-of-alignment marker ('//').
    if not line or line.startswith(('#', '//')):
      continue
    name, sequence = line.split()
    if name not in name_to_sequence:
      name_to_sequence[name] = ''
    name_to_sequence[name] += sequence

  msa = []
  deletion_matrix = []

  query = ''
  keep_columns = []
  for seq_index, sequence in enumerate(name_to_sequence.values()):
    if seq_index == 0:
      # Gather the columns with gaps from the query.
      query = sequence
      keep_columns = [i for i, res in enumerate(query) if res != '-']

    # Remove the columns with gaps in the query from all sequences.
    aligned_sequence = ''.join([sequence[c] for c in keep_columns])

    msa.append(aligned_sequence)

    # Count the number of deletions w.r.t. the query.
    deletion_vec = []
    deletion_count = 0
    for seq_res, query_res in zip(sequence, query):
      if seq_res != '-' or query_res != '-':
        if query_res == '-':
          deletion_count += 1
        else:
          deletion_vec.append(deletion_count)
          deletion_count = 0
    deletion_matrix.append(deletion_vec)

  return msa, deletion_matrix, list(name_to_sequence.keys())
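
# Illustrative example of the deletion-matrix convention (hypothetical
# two-row alignment; the query is the first row):
#   msa, dels, names = parse_stockholm('query MK-V\nhit/1-4 MKAV\n//\n')
#   # Query gap columns are dropped: msa == ['MKV', 'MKV'].
#   # dels == [[0, 0, 0], [0, 0, 1]]: 'hit/1-4' has one residue ('A') deleted
#   # relative to the query before its third kept column.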


def parse_a3m(a3m_string: str) -> Tuple[Sequence[str], DeletionMatrix]:
  """Parses sequences and deletion matrix from a3m format alignment.

  Args:
    a3m_string: The string contents of an a3m file. The first sequence in the
      file should be the query sequence.

  Returns:
    A tuple of:
    * A list of sequences that have been aligned to the query. These
      might contain duplicates.
    * The deletion matrix for the alignment as a list of lists. The element
      at `deletion_matrix[i][j]` is the number of residues deleted from
      the aligned sequence i at residue position j.
  """
  sequences, _ = parse_fasta(a3m_string)
  deletion_matrix = []
  for msa_sequence in sequences:
    deletion_vec = []
    deletion_count = 0
    for j in msa_sequence:
      # Lowercase letters mark insertions relative to the query.
      if j.islower():
        deletion_count += 1
      else:
        deletion_vec.append(deletion_count)
        deletion_count = 0
    deletion_matrix.append(deletion_vec)

  # Make the aligned (insertion-free) sequences by stripping lowercase letters.
  deletion_table = str.maketrans('', '', string.ascii_lowercase)
  aligned_sequences = [s.translate(deletion_table) for s in sequences]
  return aligned_sequences, deletion_matrix
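
# Illustrative example (hypothetical A3M content):
#   seqs, dels = parse_a3m('>query\nMKV\n>hit\nMaKV\n')
#   # seqs == ['MKV', 'MKV'], dels == [[0, 0, 0], [0, 1, 0]]: 'hit' has one
#   # inserted residue ('a') counted before its second aligned column.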


def _convert_sto_seq_to_a3m(
    query_non_gaps: Sequence[bool], sto_seq: str) -> Iterable[str]:
  """Yields an A3M rendering of one Stockholm row.

  Columns where the query has a residue are kept as-is; columns where the
  query has a gap become lowercase insertions, and gaps there are dropped.
  """
  for is_query_res_non_gap, sequence_res in zip(query_non_gaps, sto_seq):
    if is_query_res_non_gap:
      yield sequence_res
    elif sequence_res != '-':
      yield sequence_res.lower()


def convert_stockholm_to_a3m(stockholm_format: str,
                             max_sequences: Optional[int] = None) -> str:
  """Converts MSA in Stockholm format to the A3M format."""
  descriptions = {}
  sequences = {}
  reached_max_sequences = False

  # First pass: gather the aligned sequences, honouring max_sequences.
  for line in stockholm_format.splitlines():
    reached_max_sequences = max_sequences and len(sequences) >= max_sequences
    if line.strip() and not line.startswith(('#', '//')):
      # Ignore blank lines, markup and end symbols - the remainder are
      # alignment sequence parts.
      seqname, aligned_seq = line.split(maxsplit=1)
      if seqname not in sequences:
        if reached_max_sequences:
          continue
        sequences[seqname] = ''
      sequences[seqname] += aligned_seq

  # Second pass: gather the '#=GS <name> DE <description>' markup rows.
  for line in stockholm_format.splitlines():
    if line[:4] == '#=GS':
      columns = line.split(maxsplit=3)
      seqname, feature = columns[1:3]
      value = columns[3] if len(columns) == 4 else ''
      if feature != 'DE':
        continue
      if reached_max_sequences and seqname not in sequences:
        continue
      descriptions[seqname] = value
      if len(descriptions) == len(sequences):
        break

  # Convert each Stockholm row to A3M; the gap columns of the query (the
  # first sequence) determine which columns become lowercase insertions.
  a3m_sequences = {}
  query_sequence = next(iter(sequences.values()))
  query_non_gaps = [res != '-' for res in query_sequence]
  for seqname, sto_sequence in sequences.items():
    a3m_sequences[seqname] = ''.join(
        _convert_sto_seq_to_a3m(query_non_gaps, sto_sequence))

  fasta_chunks = (f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}"
                  for k in a3m_sequences)
  return '\n'.join(fasta_chunks) + '\n'  # Include terminating newline.
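
# Illustrative conversion (hypothetical Stockholm content; note the trailing
# space after '>query' because it has no DE description):
#   sto = '#=GS hit DE example hit\nquery MK-V\nhit   MKAV\n//\n'
#   convert_stockholm_to_a3m(sto)
#   # '>query \nMKV\n>hit example hit\nMKaV\n'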


def _get_hhr_line_regex_groups(
    regex_pattern: str, line: str) -> Sequence[Optional[str]]:
  match = re.match(regex_pattern, line)
  if match is None:
    raise RuntimeError(f'Could not parse query line {line}')
  return match.groups()


def _update_hhr_residue_indices_list(
    sequence: str, start_index: int, indices_list: List[int]):
  """Computes the relative indices for each residue with respect to the original sequence."""
  counter = start_index
  for symbol in sequence:
    if symbol == '-':
      indices_list.append(-1)  # Gaps are marked with -1.
    else:
      indices_list.append(counter)
      counter += 1
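
# Illustrative behaviour (hypothetical call):
#   indices = []
#   _update_hhr_residue_indices_list('AB-C', 10, indices)
#   # indices == [10, 11, -1, 12]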


def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit:
  """Parses the detailed HMM HMM comparison section for a single Hit.

  This works on .hhr files generated from both HHBlits and HHSearch.

  Args:
    detailed_lines: A list of lines from a single comparison section between 2
      sequences (which each have their own HMMs).

  Returns:
    A TemplateHit with the information from that detailed comparison section.

  Raises:
    RuntimeError: If a certain line cannot be processed.
  """
  # The first line is 'No <hit index>' and the second is the hit name.
  number_of_hit = int(detailed_lines[0].split()[-1])
  name_hit = detailed_lines[1][1:]

  # Parse the summary line, e.g.
  # 'Probab=... E-value=... Score=... Aligned_cols=... Identities=...% ...'.
  pattern = (
      r'Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t'
      r' ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t '
      r']*Template_Neff=(.*)')
  match = re.match(pattern, detailed_lines[2])
  if match is None:
    raise RuntimeError(
        'Could not parse section: %s. Expected this: \n%s to contain summary.' %
        (detailed_lines, detailed_lines[2]))
  (prob_true, e_value, _, aligned_cols, _, _, sum_probs,
   neff) = [float(x) for x in match.groups()]

  # The remaining lines alternate between query ('Q ...') and template
  # ('T ...') blocks; annotation rows (ss_dssp, ss_pred, Consensus) are
  # skipped.
  query = ''
  hit_sequence = ''
  indices_query = []
  indices_hit = []
  length_block = None

  for line in detailed_lines[3:]:
    # Parse the query sequence line.
    if (line.startswith('Q ') and not line.startswith('Q ss_dssp') and
        not line.startswith('Q ss_pred') and
        not line.startswith('Q Consensus')):
      # The line has a fixed-width, 17-character name prefix; the remainder is
      # '<start> <sequence> <end> (<full query length>)'.
      patt = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)'
      groups = _get_hhr_line_regex_groups(patt, line[17:])

      # Indices in .hhr files are 1-based, so shift the start to 0-based.
      start = int(groups[0]) - 1
      delta_query = groups[1]
      end = int(groups[2])
      num_insertions = len([x for x in delta_query if x == '-'])
      length_block = end - start + num_insertions
      assert length_block == len(delta_query)

      # Update the query sequence and indices list.
      query += delta_query
      _update_hhr_residue_indices_list(delta_query, start, indices_query)

    elif line.startswith('T '):
      # Parse the template (hit) sequence line.
      if (not line.startswith('T ss_dssp') and
          not line.startswith('T ss_pred') and
          not line.startswith('T Consensus')):
        # The hit line mirrors the query line; only the start index and the
        # sequence chunk are captured.
        patt = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)'
        groups = _get_hhr_line_regex_groups(patt, line[17:])
        start = int(groups[0]) - 1
        delta_hit_sequence = groups[1]
        assert length_block == len(delta_hit_sequence)

        # Update the hit sequence and indices list.
        hit_sequence += delta_hit_sequence
        _update_hhr_residue_indices_list(delta_hit_sequence, start, indices_hit)

  return TemplateHit(
      index=number_of_hit,
      name=name_hit,
      aligned_cols=int(aligned_cols),
      sum_probs=sum_probs,
      query=query,
      hit_sequence=hit_sequence,
      indices_query=indices_query,
      indices_hit=indices_hit,
  )


def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]:
  """Parses the content of an entire HHR file."""
  lines = hhr_string.splitlines()

  # Each .hhr file starts with a results table, followed by one "paragraph"
  # per hit; each paragraph begins with a line starting with 'No '.
  block_starts = [i for i, line in enumerate(lines) if line.startswith('No ')]

  hits = []
  if block_starts:
    block_starts.append(len(lines))  # Add the end of the final block.
    for i in range(len(block_starts) - 1):
      hits.append(_parse_hhr_hit(lines[block_starts[i]:block_starts[i + 1]]))
  return hits
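
# Typical usage (hypothetical file path):
#   with open('results.hhr') as f:
#     hits = parse_hhr(f.read())
#   best_hit = max(hits, key=lambda hit: hit.sum_probs)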


def parse_e_values_from_tblout(tblout: str) -> Dict[str, float]:
  """Parses a target-to-E-value mapping from a Jackhmmer tblout string."""
  e_values = {'query': 0}
  # Keep only non-empty, non-comment lines.
  lines = [line for line in tblout.splitlines() if line and line[0] != '#']
  # Fields in a tblout file are space-delimited: the target name is the first
  # field and the full-sequence E-value is the fifth (index 4).
  for line in lines:
    fields = line.split()
    e_value = fields[4]
    target_name = fields[0]
    e_values[target_name] = float(e_value)
  return e_values
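
# Illustrative usage (hypothetical tblout row; real rows carry more columns):
#   parse_e_values_from_tblout('hit1 - query - 1e-10')
#   # {'query': 0, 'hit1': 1e-10}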