"""Similarity scores (CA RMSD, sequence identity) between generated and reference residues."""
import numpy as np
from Bio.PDB import PDBParser, Selection

try:
    from Bio.PDB.Polypeptide import three_to_one
except ImportError:
    # Newer Biopython releases (reportedly 1.80+) no longer ship
    # ``three_to_one``.  Rebuild it from the two lookup helpers that the
    # module still exposes; like the original it raises KeyError for residue
    # names that are not standard amino acids.
    from Bio.PDB.Polypeptide import index_to_one, three_to_index

    def three_to_one(resname):
        """Map a three-letter residue name to its one-letter code."""
        return index_to_one(three_to_index(resname))

from Bio import pairwise2
from Bio.Align import substitution_matrices

from diffab.tools.eval.base import EvalTask


def reslist_rmsd(res_list1, res_list2):
    """Return the best CA RMSD between two residue lists (no superposition).

    The shorter list (M residues) is laid onto the longer one (N residues)
    in order, and the placement with the smallest summed squared CA-CA
    distance is selected by dynamic programming.  Coordinates are used as
    given; nothing is superimposed.  When both lists have the same length the
    result is the plain position-wise CA RMSD.

    Args:
        res_list1: Sequence of residue-like objects supporting
            ``res['CA'].get_coord()``.
        res_list2: Same as ``res_list1``.

    Returns:
        ``sqrt(min_SD / M)`` where ``min_SD`` is the smallest summed squared
        deviation found and ``M`` is the length of the shorter list.

    Raises:
        ValueError: If either list is empty (the RMSD would be undefined).
        The ``res['CA']`` lookup is not guarded, so whatever it raises for a
        residue without a CA atom propagates to the caller.
    """
    if len(res_list1) < len(res_list2):
        res_short, res_long = res_list1, res_list2
    else:
        res_short, res_long = res_list2, res_list1
    M, N = len(res_short), len(res_long)
    if M == 0:
        raise ValueError('reslist_rmsd() requires two non-empty residue lists')

    # sq_dist[i, j]: squared distance between CA of res_short[i] and CA of
    # res_long[j].  Computed once up front (in float64) instead of on every
    # lookup inside the DP.
    ca_short = np.array([res['CA'].get_coord() for res in res_short], dtype=np.float64)
    ca_long = np.array([res['CA'].get_coord() for res in res_long], dtype=np.float64)
    delta = ca_short[:, None, :] - ca_long[None, :, :]
    sq_dist = (delta ** 2).sum(axis=-1)

    # SD[i, j]: smallest summed squared deviation when residues i..M-1 of the
    # shorter list are placed, in order, on the longer list starting no
    # earlier than column j.  Unreachable cells stay at +inf.
    SD = np.full([M, N], np.inf)

    # Boundary: on the last feasible diagonal (j = N - M + i) the remaining
    # residues must be matched one-to-one, so the cell is a suffix sum of
    # that diagonal.
    offset = N - M
    rows = np.arange(M)
    diagonal = sq_dist[rows, rows + offset]
    SD[rows, rows + offset] = np.cumsum(diagonal[::-1])[::-1]

    # Last row: the final residue is scored at exactly column j.
    # NOTE(review): because this row is not a running minimum, the last
    # residue always ends up directly after the second-to-last one, while
    # earlier residues may be separated by gaps -- confirm this is intended.
    SD[M - 1, :] = sq_dist[M - 1, :]

    # Either match residue i at column j, or skip column j.
    for i in range(M - 2, -1, -1):
        for j in range(offset + i - 1, -1, -1):
            SD[i, j] = min(
                sq_dist[i, j] + SD[i + 1, j + 1],
                SD[i, j + 1],
            )

    min_SD = SD[0, :offset + 1].min()
    best_RMSD = np.sqrt(min_SD / M)
    return best_RMSD


def entity_to_seq(entity):
    """Convert the residues of ``entity`` into a one-letter sequence.

    Residues whose name ``three_to_one`` does not know (it raises KeyError)
    are skipped.

    Args:
        entity: Anything ``Selection.unfold_entities(entity, 'R')`` accepts.

    Returns:
        ``(seq, mapping)``: the one-letter sequence and, for each letter, the
        ``get_id()`` of the residue it came from.
    """
    letters = []
    mapping = []
    for res in Selection.unfold_entities(entity, 'R'):
        try:
            letter = three_to_one(res.get_resname())
        except KeyError:
            continue
        letters.append(letter)
        mapping.append(res.get_id())
    seq = ''.join(letters)
    assert len(seq) == len(mapping)
    return seq, mapping


def reslist_seqid(res_list1, res_list2):
    """Return the percent sequence identity between two residue lists.

    Both lists are converted with ``entity_to_seq`` and aligned with
    ``align_sequences``; only the identity value is returned.
    """
    seq1, _ = entity_to_seq(res_list1)
    seq2, _ = entity_to_seq(res_list2)
    _, seq_id = align_sequences(seq1, seq2)
    return seq_id


def align_sequences(sequence_A, sequence_B, **kwargs):
    """Globally align two sequences and report their identity.

    Uses ``pairwise2.align.globalds`` with end gaps left unpenalised.

    Args:
        sequence_A: First sequence.
        sequence_B: Second sequence.
        **kwargs: Optional ``matrix`` (substitution matrix, BLOSUM62 when
            omitted), ``gap_open`` (default -10.0) and ``gap_extend``
            (default -0.5).

    Returns:
        ``((aligned_A, aligned_B), seq_id)`` where ``seq_id`` is the
        percentage of alignment columns holding identical characters.  If
        ``pairwise2`` yields no alignment (e.g. for an empty sequence), the
        inputs are returned padded against gaps with an identity of 0.0.
    """

    def _calculate_identity(sequenceA, sequenceB):
        """Return the percentage of identical columns of two aligned sequences."""
        length = len(sequenceA)
        if length == 0:
            return 0.0
        matches = sum(a == b for a, b in zip(sequenceA, sequenceB))
        return (100 * matches) / length

    # Load the default matrix only when the caller did not supply one.
    matrix = kwargs.get('matrix')
    if matrix is None:
        matrix = substitution_matrices.load("BLOSUM62")
    gap_open = kwargs.get('gap_open', -10.0)
    gap_extend = kwargs.get('gap_extend', -0.5)

    alns = pairwise2.align.globalds(
        sequence_A, sequence_B,
        matrix, gap_open, gap_extend,
        penalize_end_gaps=(False, False),
    )
    if not alns:
        # Nothing could be aligned: pair every residue with a gap so the two
        # returned strings still have equal length.
        aligned_A = sequence_A + '-' * len(sequence_B)
        aligned_B = '-' * len(sequence_A) + sequence_B
        return (aligned_A, aligned_B), 0.0

    best_aln = alns[0]
    aligned_A, aligned_B = best_aln[0], best_aln[1]

    # Calculate sequence identity
    seq_id = _calculate_identity(aligned_A, aligned_B)
    return (aligned_A, aligned_B), seq_id


def extract_reslist(model, residue_first, residue_last):
    """Collect the residues of one chain lying between two positions.

    Args:
        model: Object indexable by chain id (``model[chain_id]``).
        residue_first: Sequence whose first item is the chain id and whose
            remaining items form the lower bound, compared against
            ``(res.id[1], res.id[2])`` of every residue.
        residue_last: Same layout, giving the inclusive upper bound.  Must
            name the same chain as ``residue_first``.

    Returns:
        List of residues of that chain with
        ``first <= (res.id[1], res.id[2]) <= last``, in iteration order.
    """
    assert residue_first[0] == residue_last[0], \
        'residue_first and residue_last must be on the same chain'
    residue_first, residue_last = tuple(residue_first), tuple(residue_last)

    chain_id = residue_first[0]
    pos_first, pos_last = residue_first[1:], residue_last[1:]
    chain = model[chain_id]
    return [
        res
        for res in Selection.unfold_entities(chain, 'R')
        if pos_first <= (res.id[1], res.id[2]) <= pos_last
    ]


def eval_similarity(task: EvalTask):
    """Score a generated structure against its reference.

    Extracts the residue range ``task.residue_first`` .. ``task.residue_last``
    from both the generated and the reference model, then stores the results
    of ``reslist_rmsd`` and ``reslist_seqid`` in ``task.scores`` under the
    keys ``'rmsd'`` and ``'seqid'``.

    Returns:
        The same ``task`` object, with ``task.scores`` updated in place.
    """
    model_gen = task.get_gen_biopython_model()
    model_ref = task.get_ref_biopython_model()

    reslist_gen = extract_reslist(model_gen, task.residue_first, task.residue_last)
    reslist_ref = extract_reslist(model_ref, task.residue_first, task.residue_last)

    task.scores.update({
        'rmsd': reslist_rmsd(reslist_gen, reslist_ref),
        'seqid': reslist_seqid(reslist_gen, reslist_ref),
    })
    return task