from collections import OrderedDict from typing import Union, List, Generator, Tuple from Bio import SeqIO from Bio.SeqRecord import SeqRecord import pandas as pd from abnumber.alignment import Alignment from abnumber.common import _anarci_align, _validate_chain_type, SUPPORTED_SCHEMES, SUPPORTED_CDR_DEFINITIONS, \ is_integer, SCHEME_BORDERS, _get_unique_chains from abnumber.exceptions import ChainParseError import numpy as np from Bio.Seq import Seq from abnumber.position import Position class Chain: """ Antibody chain aligned to a chosen antibody numbering scheme :example: >>> from abnumber import Chain >>> >>> seq = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSSAKTTAPSVYPLA' >>> chain = Chain(seq, scheme='imgt') >>> chain QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^ Chain can be iterated: >>> for pos, aa in chain: >>> print(pos, aa) H1 Q H2 V H3 Q H4 L H5 Q ... Chain can also be indexed and sliced using scheme numbering: >>> chain['5'] 'Q' >>> for pos, aa in chain['H2':'H5']: >>> print(pos, aa) H2 V H3 Q H4 L H5 Q :param sequence: Unaligned string sequence :param name: Optional sequence identifier :param scheme: Numbering scheme: One of ``imgt``, ``chothia``, ``kabat``, ``aho`` :param cdr_definition: Numbering scheme to be used for definition of CDR regions. Same as ``scheme`` by default. One of ``imgt``, ``chothia``, ``kabat``, ``north``. Required for ``aho``. :param assign_germline: Assign germline name using ANARCI based on best sequence identity :param allowed_species: Allowed species for germline assignment. Use ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'`` :param aa_dict: (Internal use only) Create Chain object directly from dictionary of region objects (internal use) :param tail: (Internal use only) Constant region sequence :param species: (Internal use only) Species as identified by ANARCI :param germline: (Internal use only) Germline as identified by ANARCI """ def __init__(self, sequence, scheme, cdr_definition=None, name=None, assign_germline=False, allowed_species=None, **kwargs): aa_dict = kwargs.pop('aa_dict', None) chain_type = kwargs.pop('chain_type', None) tail = kwargs.pop('tail', None) species = kwargs.pop('species', None) v_gene = kwargs.pop('v_gene', None) j_gene = kwargs.pop('j_gene', None) if isinstance(allowed_species, str): allowed_species = [allowed_species] if len(kwargs): raise TypeError(f'Argument not recognized: {", ".join(kwargs)}') if aa_dict is not None: if sequence is not None: raise ChainParseError('Only one of aa_dict= and sequence= can be provided') assert isinstance(aa_dict, dict), f'Expected dict, got: {type(aa_dict)}' assert tail is not None assert chain_type is not None else: if sequence is None: raise ChainParseError('Expected sequence, got None') if not isinstance(sequence, str) and not isinstance(sequence, Seq): raise ChainParseError(f'Expected string or Seq, got {type(sequence)}: {sequence}') if '-' in sequence: raise ChainParseError(f'Please provide an unaligned sequence, got: {sequence}') if chain_type is not None: raise ChainParseError('Do not use chain_type= when providing sequence=, it will be inferred automatically') if tail is not None: raise ChainParseError('Do not use tail= when providing sequence=, it will be inferred automatically') if isinstance(sequence, Seq): sequence = str(sequence) results = _anarci_align(sequence, scheme=scheme, allowed_species=allowed_species, assign_germline=assign_germline) if len(results) > 1: raise ChainParseError(f'Found {len(results)} antibody domains in sequence: "{sequence}"') aa_dict, chain_type, tail, species, v_gene, j_gene = results[0] _validate_chain_type(chain_type) self.name: str = name """User-provided sequence identifier""" self.chain_type: str = chain_type """Chain type as identified by ANARCI: ``H`` (heavy), ``K`` (kappa light) or ``L`` (lambda light) See also :meth:`Chain.is_heavy_chain` and :meth:`Chain.is_light_chain`. """ self.scheme: str = scheme """Numbering scheme used to align the sequence""" self.cdr_definition: str = cdr_definition or scheme """Numbering scheme to be used for definition of CDR regions (same as ``scheme`` by default)""" self.tail: str = tail """Constant region sequence""" self.species: str = species """Species as identified by ANARCI""" self.v_gene: str = v_gene """V gene germline as identified by ANARCI (if assign_germline is True)""" self.j_gene: str = j_gene """J gene germline as identified by ANARCI (if assign_germline is True)""" self.fr1_dict = OrderedDict() self.cdr1_dict = OrderedDict() self.fr2_dict = OrderedDict() self.cdr2_dict = OrderedDict() self.fr3_dict = OrderedDict() self.cdr3_dict = OrderedDict() self.fr4_dict = OrderedDict() self._init_from_dict(aa_dict, allowed_species=allowed_species) def _init_from_dict(self, aa_dict, allowed_species): if self.scheme not in SUPPORTED_SCHEMES: raise NotImplementedError(f'Scheme "{self.scheme}" is not supported. Available schemes: {", ".join(SUPPORTED_SCHEMES)}') if self.cdr_definition in ['aho']: raise ValueError('CDR regions are not defined for AHo, ' 'you need to specify cdr_definition="chothia" or another scheme for CDR extraction.') if self.cdr_definition not in SUPPORTED_CDR_DEFINITIONS: raise NotImplementedError(f'CDR definition "{self.scheme}" is not supported. Available definitions: {", ".join(SUPPORTED_SCHEMES)}') # list of region start positions borders = SCHEME_BORDERS[self.cdr_definition] if self.cdr_definition in SCHEME_BORDERS else SCHEME_BORDERS[f'{self.cdr_definition}_{self.chain_type}'] regions_list = [self.fr1_dict, self.cdr1_dict, self.fr2_dict, self.cdr2_dict, self.fr3_dict, self.cdr3_dict, self.fr4_dict] region_idx = 0 sorted_positions = sorted(aa_dict.keys()) cdr_definition_ready = True for pos in sorted_positions: assert pos.scheme == self.scheme, f'Schemes of provided position ({pos.scheme}) does not match Chain scheme ({self.scheme})' if pos.cdr_definition != self.cdr_definition: cdr_definition_ready = False if cdr_definition_ready: combined_aa_dict = aa_dict else: seq = ''.join(aa_dict[pos] for pos in sorted_positions) renumbered_aa_dict = _anarci_align( seq, scheme=self.cdr_definition if self.cdr_definition != 'north' else 'chothia', allowed_species=allowed_species )[0][0] cdr_definition_positions = [pos.number for pos in sorted(renumbered_aa_dict.keys())] combined_aa_dict = {} for orig_pos, cdr_definition_position in zip(sorted_positions, cdr_definition_positions): aa = aa_dict[orig_pos] pos = orig_pos.copy() pos.set_cdr_definition(self.cdr_definition, cdr_definition_position) combined_aa_dict[pos] = aa for pos in sorted(combined_aa_dict.keys()): assert isinstance(pos, Position), f'Expected Position object, got {type(pos)}: {pos}' aa = combined_aa_dict[pos].upper().strip() if aa in [None, '*', '-', '', '.']: continue while pos.cdr_definition_position >= borders[region_idx]: region_idx += 1 regions_list[region_idx][pos] = aa def __repr__(self): return self.format() def __str__(self): return self.seq def __iter__(self): yield from self.positions.items().__iter__() def __getitem__(self, item): if isinstance(item, slice): if item.step is not None and item.step != 1: raise IndexError(f'Slicing with step != 1 is not implemented, got: {item}') return self.slice(start=item.start, stop=item.stop) pos = self._parse_position(item) return self.positions[pos] def __len__(self): return len(self.positions) def __hash__(self): return hash(self.positions) def __eq__(self, other): """Check chain equality. Only checks scheme, aligned sequence and tail sequence, ignores name, metadata and CDR definitions.""" assert isinstance(other, Chain), f'Can only compare Chain to another Chain, got {type(other)}: {other}' return self.positions == other.positions and self.tail == other.tail @classmethod def to_fasta(cls, chains, path_or_fd, keep_tail=False, description=''): """Save multiple chains to FASTA""" if isinstance(chains, Chain): records = chains.to_seq_record(keep_tail=keep_tail, description=description) else: records = (chain.to_seq_record(keep_tail=keep_tail, description=description) for chain in chains) return SeqIO.write(records, path_or_fd, 'fasta-2line') @classmethod def from_fasta(cls, path_or_handle, scheme, cdr_definition=None, as_series=False, as_generator=False, **kwargs) -> Union[List['Chain'], pd.Series, Generator['Chain', None, None]]: """Read multiple chains from FASTA""" generator = (cls(record.seq, name=record.name, scheme=scheme, cdr_definition=cdr_definition, **kwargs) for record in SeqIO.parse(path_or_handle, 'fasta')) if as_generator: return generator chains = list(generator) if as_series: return pd.Series(chains, index=[c.name for c in chains]) return chains def to_seq_record(self, keep_tail=False, description=''): """Create BioPython SeqRecord object from this Chain""" if not self.name: raise ValueError('Name needs to be present to convert to a SeqRecord') seq = Seq(self.seq + self.tail if keep_tail else self.seq) return SeqRecord(seq, id=self.name, description=description) @classmethod def to_anarci_csv(cls, chains: List['Chain'], path): """Save multiple chains to ANARCI-like CSV""" df = cls.to_dataframe(chains) df.to_csv(path) @classmethod def to_dataframe(cls, chains: List['Chain']): """Produce a Pandas dataframe with aligned chain sequences in the columns Note: Contains only positions (columns) that are present in the provided chains, so number of columns can differ based on the input. """ series_list = [chain.to_series() for chain in chains] # Each chain can have a different set of positions # so we need to sort the columns to make sure they are in the right order # this is using the correct Position sorting columns = set(c for series in series_list for c in series.index) prop_columns = [c for c in columns if not isinstance(c, Position)] position_columns = sorted([c for c in columns if isinstance(c, Position)]) # Columns can come from K and L chain, so we need to convert them to string and remove duplicates here position_columns_str = pd.Series( [pos.format(chain_type=False) for pos in position_columns] ).drop_duplicates().to_list() # Get full list of string columns columns_str = prop_columns + position_columns_str # Reindex each series using ordered list of string columns series_list_ordered = [] for series in series_list: series.index = series.index.map(lambda pos: pos.format(chain_type=False)) series_list_ordered.append(series.reindex(columns_str)) df = pd.DataFrame(series_list_ordered)[columns_str].fillna('-') df.index.name = 'Id' return df def to_series(self): props = { 'chain_type': self.chain_type, 'species': self.species } return pd.Series({**props, **self.positions}, name=self.name) @classmethod def from_series(cls, series, scheme, cdr_definition=None) -> 'Chain': chain_type = series['chain_type'] species = series.get('species') position_index = [c for c in series.index if c[:1].isnumeric()] aa_dict = {Position.from_string(pos, chain_type=chain_type, scheme=scheme): aa for pos, aa in series[position_index].items() if aa != '-' and not pd.isna(aa)} return cls(sequence=None, aa_dict=aa_dict, name=series.name, scheme=scheme, cdr_definition=cdr_definition, chain_type=chain_type, species=species, tail='') @classmethod def from_anarci_csv(cls, path, scheme, cdr_definition=None, as_series=False) -> Union[List['Chain'], pd.Series]: df = pd.read_csv(path, index_col=0) return cls.from_dataframe(df, scheme=scheme, cdr_definition=cdr_definition, as_series=as_series) @classmethod def from_dataframe(cls, df, scheme, cdr_definition=None, as_series=False) -> Union[List['Chain'], pd.Series]: chains = [cls.from_series(series, scheme=scheme, cdr_definition=cdr_definition) for i, series in df.iterrows()] if as_series: return pd.Series(chains, index=[c.name for c in chains]) return chains def format(self, method='wide', **kwargs): """Format sequence to string :param method: use ``"wide"`` for :meth:`Chain.format_wide` or ``"tall"`` for :meth:`Chain.format_tall()` :return: formatted string """ if method == 'wide': return self.format_wide(**kwargs) elif method == 'tall': return self.format_tall(**kwargs) raise ValueError(f'Use method="wide" or method="tall", unknown method: "{method}"') def print(self, method='wide', **kwargs): """Print string representation using :meth:`Chain.format` By default, produces "wide" format with sequence on first line and CDR regions higlighted with ``^`` on second line: >>> chain.print() QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^ :param method: use ``"wide"`` for :meth:`Chain.format_wide` or ``"tall"`` for :meth:`Chain.format_tall()` """ print(self.format(method=method, **kwargs)) def format_tall(self, columns=5): """Create string with one position per line, showing position numbers and amino acids :return: formatted string """ height = int(np.ceil(len(self) / columns)) rows = [''] * height for column, start in enumerate(range(0, len(self), height)): chain_slice = self.raw[start:start+height] for row, (pos, aa) in enumerate(chain_slice): rows[row] = rows[row].ljust(column * 15) pos_format = (pos.get_region() + ' ' if pos.is_in_cdr() else '') + pos.format() rows[row] += f'{pos_format.rjust(9)} {aa}' return '\n'.join(rows) def print_tall(self, columns=5): """Print string representation using :meth:`Chain.format_tall` >>> chain.print_tall() FR1 H1 Q FR1 H2 V FR1 H3 Q FR1 H4 L FR1 H5 Q FR1 H6 Q FR1 H7 S ... """ print(self.format_tall(columns=columns)) def format_wide(self, numbering=False): """Create string with sequence on first line and CDR regions higlighted with `^` on second line :param numbering: Add position numbers on top :return: formatted string """ lines = [] if numbering: first_order = '' prev_number = None after_double_digit = False for pos in self.positions: number = str(pos.number // 10) if number != prev_number: if after_double_digit: # Special case: when double digits follow another double digits, do not print the first digit number = number[1:] first_order += number if len(number) > 1: after_double_digit = True else: if after_double_digit: # Special case: After 10, 11, etc, skip adding the space after_double_digit = False else: first_order += ' ' prev_number = number lines.append(first_order) lines.append(''.join(str(pos.number % 10) for pos in self.positions)) letters = ''.join(pos.letter or ' ' for pos in self.positions) if letters.strip(): lines.append(letters) lines.append(self.seq) if self.cdr_definition == 'kabat': lines.append(''.join('^' if pos.is_in_cdr() else ("°" if pos.is_in_vernier() else ' ') for pos in self.positions)) else: lines.append(''.join('^' if pos.is_in_cdr() else ' ' for pos in self.positions)) return '\n'.join(lines) def print_wide(self, numbering=False): """Print string representation using :meth:`Chain.format_wide` >>> chain.print_wide() QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^ """ print(self.format_wide(numbering=numbering)) def is_heavy_chain(self): """Check if this chain is heavy chain (``chain_type=="H"``)""" return self.chain_type == 'H' def is_light_chain(self): """Check if this chain is light chain (``chain_type=="K" or chain_type=="L"``)""" return self.is_lambda_light_chain() or self.is_kappa_light_chain() def is_lambda_light_chain(self): """Check if this chain is lambda light chain (``chain_type=="L"``)""" return self.chain_type == 'L' def is_kappa_light_chain(self): """Check if this chain is kappa light chain (``chain_type=="K"``)""" return self.chain_type == 'K' def align(self, *other) -> 'Alignment': """Align this chain to other chains by using their existing numbering >>> from abnumber import Chain >>> >>> seq1 = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSSAKTTAP' >>> chain1 = Chain(seq1, scheme='imgt') >>> >>> seq2 = 'QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYDDYLDRWGQGTTLTVSSAKTTAP' >>> chain2 = Chain(seq2, scheme='imgt') >>> >>> alignment = chain1.align(chain2) >>> print(alignment.format()) QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPS-RGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS ||||.||||||.||||+|||||||||||.||||||||||||||||+||||||||.|.||||||||||||||||||||||||||.+|||||||||||||||||....||.||||||||||| QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYD--DYLDRWGQGTTLTVSS ^^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^^^ :param other: The :class:`Chain` object to align, can be repeated to create a multiple sequence alignment :return: :class:`Alignment` object """ pos_dicts = [self.positions] for chain in other: assert isinstance(chain, Chain), f'Expected Chain object, got {type(chain)}: {chain}' pos_dicts.append(chain.positions) unique_cdr_definitions = set(pos.cdr_definition for pos_dict in pos_dicts for pos in pos_dict.keys()) assert len(unique_cdr_definitions) <= 1, f'Aligned chains should use the same CDR definitions, got: {unique_cdr_definitions}' shared_pos = sorted(set(pos for pos_dict in pos_dicts for pos in pos_dict.keys())) residues = [tuple(pos_dict.get(pos, '-') for pos_dict in pos_dicts) for pos in shared_pos] return Alignment(shared_pos, residues, chain_type=self.chain_type, scheme=self.scheme) def clone(self, replace_seq: str = None): """Create a copy of this chain, optionally with a replacement sequence :param replace_seq: Optional replacement sequence, needs to be the same length :return: new Chain object """ return self.slice(replace_seq=replace_seq) def slice(self, replace_seq: str = None, start: Union[str, int, 'Position'] = None, stop: Union[str, int, 'Position'] = None, stop_inclusive: bool = True, allow_raw: bool = False): """Create a slice of this chain, optionally with a replacement sequence that is placed into the same numbering You can also slice directly using ``chain['111':'112A']`` or ``chain.raw[10:20]``. :param replace_seq: Optional replacement sequence, needs to be the same length :param start: Optional slice start position (inclusive), :class:`Position` or string (e.g. '111A') :param stop: Optional slice stop position (inclusive), :class:`Position` or string (e.g. '112A') :param stop_inclusive: Include stop position in slice :param allow_raw: Allow unaligned numeric indexing from 0 to length of sequence - 1 :return: new Chain object """ aa_dict = {} positions = self.positions if replace_seq is not None: assert len(replace_seq) == len(positions), 'Sequence needs to be the same length' start = self._parse_position(start, allow_raw=allow_raw) if start is not None else None stop = self._parse_position(stop, allow_raw=allow_raw) if stop is not None else None for i, (pos, aa) in enumerate(positions.items()): if start is not None and pos < start: continue if stop is not None and (pos > stop or (not stop_inclusive and pos >= stop)): break aa_dict[pos] = replace_seq[i] if replace_seq is not None else aa return Chain( sequence=None, aa_dict=aa_dict, name=self.name, scheme=self.scheme, chain_type=self.chain_type, cdr_definition=self.cdr_definition, tail=self.tail, species=self.species, v_gene=self.v_gene, j_gene=self.j_gene ) def renumber(self, scheme=None, cdr_definition=None, allowed_species=None): """Return copy of this chain aligned using a different numbering scheme or CDR definition :param scheme: Change numbering scheme: One of ``imgt``, ``chothia``, ``kabat``, ``aho``. :param cdr_definition: Change CDR definition scheme: One of ``imgt``, ``chothia``, ``kabat``, ``north``. :param allowed_species: ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'`` """ return Chain( self.seq + self.tail, name=self.name, allowed_species=allowed_species, scheme=scheme or self.scheme, cdr_definition=cdr_definition or scheme or self.cdr_definition, assign_germline=self.v_gene is not None ) def graft_cdrs_onto(self, other: 'Chain', backmutate_vernier=False, backmutations: List[Union['Position',str]] = [], name: str = None) -> 'Chain': """Graft CDRs from this Chain onto another chain :param other: Chain to graft CDRs into (source of frameworks and tail sequence) :param backmutate_vernier: Also graft all Kabat Vernier positions from this chain (perform backmutations) :param backmutations: List of positions that should additionally be grafted from this chain (str or or :class:`Position`) :param name: Name of new Chain. If not provided, use name of this chain. :return: Chain with CDRs grafted from this chain and frameworks from the given chain """ assert self.scheme == other.scheme, \ f'Sequences need to have the same numbering scheme, got {self.scheme} and {other.scheme}' assert self.cdr_definition == other.cdr_definition, \ f'Sequences need to have the same CDR definition, got {self.cdr_definition} and {other.cdr_definition}' assert self.chain_type == other.chain_type, \ f'Sequences need to have the same chain type, got {self.chain_type} and {other.chain_type}' backmutations = [self._parse_position(pos) for pos in backmutations] grafted_dict = {pos: aa for pos, aa in other if not pos.is_in_cdr()} for pos, aa in self: if pos.is_in_cdr() or (backmutate_vernier and pos.is_in_vernier()) or pos in backmutations: grafted_dict[pos] = aa return Chain(sequence=None, aa_dict=grafted_dict, name=name or self.name, chain_type=self.chain_type, scheme=self.scheme, cdr_definition=self.cdr_definition, tail=other.tail, v_gene=other.v_gene, j_gene=other.j_gene) def graft_cdrs_onto_human_germline(self, v_gene=None, j_gene=None, backmutate_vernier=False, backmutations: List[Union['Position',str]] = []): """Graft CDRs from this Chain onto the nearest human germline sequence :param v_gene: Use defined V germline allele (e.g. IGHV1-18*01), gene (e.g. IGHV1-18) or family (e.g. IGHV1) :param j_gene: Use defined J germline allele (e.g. IGHJ1*01) or gene (e.g. IGHJ1) :param backmutate_vernier: Also graft all Kabat Vernier positions from this chain (perform backmutations) :param backmutations: List of positions that should additionally be grafted from this chain (str or or :class:`Position`) :return: Chain with CDRs grafted from this chain and frameworks from TODO """ germline_chain = self.find_merged_human_germline(v_gene=v_gene, j_gene=j_gene) if self.scheme != 'imgt' or self.cdr_definition != 'imgt': germline_chain = germline_chain.renumber(self.scheme, self.cdr_definition) return self.graft_cdrs_onto(germline_chain, backmutate_vernier=backmutate_vernier, backmutations=backmutations) def _parse_position(self, position: Union[int, str, 'Position'], allow_raw=False): """Create :class:`Position` key object from string or int. Note: The position should only be used for indexing, CDR definition is not preserved! :param position: Numeric or string position representation :param allow_raw: Also allow unaligned numeric (int) indexing from 0 to length of sequence - 1 :return: new Position object, should only be used for indexing, CDR definition is not preserved! """ if isinstance(position, str): return Position.from_string(position, chain_type=self.chain_type, scheme=self.scheme) if isinstance(position, Position): return position try: position = int(position) except TypeError: raise IndexError(f'Invalid position key, expected Position, string or integer, got {type(position)}: "{position}"') if not allow_raw: raise IndexError("Use chain.raw[i] for raw numeric indexing or pass allow_raw=True. " "For named position indexing, use string (e.g. chain['111A'] or chain['H111A'])") if position >= len(self.positions): return None return self.get_position_by_raw_index(position) def get_position_by_raw_index(self, index): """Get Position object at corresponding raw numeric position""" return list(self.positions.keys())[index] def find_human_germlines(self, limit=10, v_gene=None, j_gene=None, unique=True) -> Tuple[List['Chain'], List['Chain']]: """Find most identical V and J germline sequences based on IMGT alignment :param limit: Number of best matching germlines to return :param v_gene: Filter germlines to specific V gene name :param j_gene: Filter germlines to specific J gene name :param unique: Skip germlines with duplicate amino acid sequence :return: list of top V chains, list of top J chains """ from abnumber.germlines import get_imgt_v_chains, get_imgt_j_chains chain = self if self.scheme == 'imgt' and self.cdr_definition == 'imgt' else self.renumber('imgt') v_chains = list(get_imgt_v_chains(chain.chain_type).values()) j_chains = list(get_imgt_j_chains(chain.chain_type).values()) if v_gene: if v_gene.startswith('IGKV') and self.chain_type == 'L': raise NotImplementedError('Cannot graft lambda chain into kappa chain') if v_gene.startswith('IGLV') and self.chain_type == 'K': raise NotImplementedError('Cannot graft kappa chain into lambda chain') v_chains = [chain for chain in v_chains if chain.name.startswith(v_gene)] if not v_chains: print('Available V genes:', get_imgt_v_chains(chain.chain_type).keys()) raise ValueError(f'No V genes found for "{chain.chain_type}" chain gene name "{v_gene}"') if j_gene: j_chains = [chain for chain in j_chains if chain.name.startswith(j_gene)] if not j_chains: print('Available J genes:', get_imgt_j_chains(chain.chain_type).keys()) raise ValueError(f'No J genes found for "{chain.chain_type}" chain gene name "{j_gene}"') if unique: v_chains = _get_unique_chains(v_chains) j_chains = _get_unique_chains(j_chains) v_alignments = [chain.align(germline) for germline in v_chains] v_ranks = np.array([alignment.num_mutations() for alignment in v_alignments]).argsort(kind='stable')[:limit] top_v_chains = [v_chains[r] for r in v_ranks] j_alignments = [chain.align(germline) for germline in j_chains] j_ranks = np.array([alignment.num_mutations() for alignment in j_alignments]).argsort(kind='stable')[:limit] top_j_chains = [j_chains[r] for r in j_ranks] return top_v_chains, top_j_chains def find_merged_human_germline(self, top=0, v_gene=None, j_gene=None) -> 'Chain': """Find n-th most identical V and J germline sequence based on IMGT alignment and merge them into one Chain :param top: Return top N most identical germline (0-indexed) :param v_gene: Filter germlines to specific V gene name :param j_gene: Filter germlines to specific J gene name :return: merged germline sequence Chain object """ v_chains, j_chains = self.find_human_germlines(limit=top+1, v_gene=v_gene, j_gene=j_gene) v_chain = v_chains[top] j_chain = j_chains[top] merged_dict = { **{pos: aa for pos, aa in j_chain}, **{pos: aa for pos, aa in v_chain} } return Chain( sequence=None, aa_dict=merged_dict, chain_type=self.chain_type, scheme='imgt', tail='' ) @property def raw(self): """Access raw representation of this chain to allow unaligned numeric indexing and slicing >>> # String numbering is based on schema numbering >>> chain['1'] 'QVQLQQSGAE' >>> # Numbering of ``chain.raw`` starts at 0 >>> chain.raw[0] 'QVQLQQSGAE' >>> # Slicing with string is based on schema numbering, the end is inclusive >>> chain['1':'10'] 'QVQLQQSGAE' >>> # Slicing with ``chain.raw`` starts at 0, the end is exclusive (Python style) >>> chain.raw[0:10] 'QVQLQQSGAE' :return: Raw chain accessor that can be sliced or indexed to produce a new :class:`Chain` object """ return RawChainAccessor(self) @property def regions(self): """Dictionary of region dictionaries Region is an uppercase string, one of: ``"FR1", "CDR1", "FR2", "CDR2", "FR3", "CDR3", "FR4"`` :return: Dictionary of Region name -> Dictionary of (:class:`Position` -> Amino acid) """ return OrderedDict( FR1=self.fr1_dict, CDR1=self.cdr1_dict, FR2=self.fr2_dict, CDR2=self.cdr2_dict, FR3=self.fr3_dict, CDR3=self.cdr3_dict, FR4=self.fr4_dict ) @property def positions(self): """Dictionary of :class:`Position` -> Amino acid""" positions = OrderedDict() for region, aa_dict in self.regions.items(): for pos, aa in aa_dict.items(): positions[pos] = aa return positions @property def seq(self): """Unaligned string representation of the variable chain sequence :return: Unaligned string representation of the variable chain sequence """ return ''.join(self.positions.values()) @property def fr1_seq(self): """Unaligned string representation of the Framework 1 region sequence""" return ''.join(self.fr1_dict.values()) @property def cdr1_seq(self): """Unaligned string representation of the CDR 1 region sequence""" return ''.join(self.cdr1_dict.values()) @property def fr2_seq(self): """Unaligned string representation of the Framework 2 region sequence""" return ''.join(self.fr2_dict.values()) @property def cdr2_seq(self): """Unaligned string representation of the CDR 2 region sequence""" return ''.join(self.cdr2_dict.values()) @property def fr3_seq(self): """Unaligned string representation of the Framework 3 region sequence""" return ''.join(self.fr3_dict.values()) @property def cdr3_seq(self): """Unaligned string representation of the CDR 3 region sequence""" return ''.join(self.cdr3_dict.values()) @property def fr4_seq(self): """Unaligned string representation of the Framework 4 region sequence""" return ''.join(self.fr4_dict.values()) class RawChainAccessor: def __init__(self, chain: Chain): self.chain = chain def __getitem__(self, item): if isinstance(item, slice): if item.step is not None and item.step != 1: raise IndexError(f'Slicing with step != 1 is not implemented, got: {item}') if item.start is not None and not is_integer(item.start): raise IndexError(f'Expected int start index for chain.raw, got {type(item.start)}: {item.start}') if item.stop is not None and not is_integer(item.stop): raise IndexError(f'Expected int end index for chain.raw, got {type(item.stop)}: {item.stop}') return self.chain.slice(start=item.start, stop=item.stop, stop_inclusive=False, allow_raw=True) if not is_integer(item): raise IndexError(f'Expected int indexing for chain.raw, got {type(item)}: {item}') pos = self.chain.get_position_by_raw_index(item) return self.chain[pos]