PascalNotin committed
Commit 1335bda • Parent(s): 6590011
Implemented first version of design app
Files changed:
- README.md +1 -1
- app.py +139 -0
- requirements.txt +4 -0
- tranception/__init__.py +1 -0
- tranception/activations.py +114 -0
- tranception/config.py +36 -0
- tranception/model_pytorch.py +930 -0
- tranception/outputs.py +48 -0
- tranception/utils/__init__.py +1 -0
- tranception/utils/dms_utils.py +30 -0
- tranception/utils/msa_utils.py +361 -0
- tranception/utils/scoring_utils.py +203 -0
- tranception/utils/tokenizers/Basic_tokenizer +1 -0
README.md
CHANGED
@@ -1,7 +1,7 @@
 ---
 title: Tranception Design
 emoji: 🐨
-colorFrom:
+colorFrom: blue
 colorTo: gray
 sdk: gradio
 sdk_version: 3.1.7
app.py
ADDED
@@ -0,0 +1,139 @@
import torch
import transformers
from transformers import PreTrainedTokenizerFast
import tranception
import datasets
from tranception import config, model_pytorch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr

tokenizer = PreTrainedTokenizerFast(tokenizer_file="./tranception/utils/tokenizers/Basic_tokenizer",
                                    unk_token="[UNK]",
                                    sep_token="[SEP]",
                                    pad_token="[PAD]",
                                    cls_token="[CLS]",
                                    mask_token="[MASK]"
                                    )

#######################################################################################################################################
############################################### HELPER FUNCTIONS #####################################################################
#######################################################################################################################################

AA_vocab = "ACDEFGHIKLMNPQRSTVWY"

def create_all_single_mutants(sequence, AA_vocab=AA_vocab, mutation_range_start=None, mutation_range_end=None):
    """Enumerate all single amino acid substitutions within the selected mutation range."""
    all_single_mutants = {}
    sequence_list = list(sequence)
    if mutation_range_start is None: mutation_range_start = 1
    if mutation_range_end is None: mutation_range_end = len(sequence)
    # position is the 1-indexed position in the full sequence
    for position, current_AA in enumerate(sequence[mutation_range_start-1:mutation_range_end], start=mutation_range_start):
        for mutated_AA in AA_vocab:
            if current_AA != mutated_AA:
                mutated_sequence = sequence_list.copy()
                mutated_sequence[position-1] = mutated_AA
                all_single_mutants[current_AA+str(position)+mutated_AA] = "".join(mutated_sequence)
    all_single_mutants = pd.DataFrame.from_dict(all_single_mutants, columns=['mutated_sequence'], orient='index')
    all_single_mutants.reset_index(inplace=True)
    all_single_mutants.columns = ['mutant', 'mutated_sequence']
    return all_single_mutants

def create_scoring_matrix_visual(scores, sequence, AA_vocab=AA_vocab, mutation_range_start=None, mutation_range_end=None):
    """Plot a heatmap (amino acid x position) of fitness scores for all single substitutions in the mutation range."""
    piv = scores.pivot(index='position', columns='target_AA', values='avg_score').transpose().round(4)
    fig, ax = plt.subplots(figsize=(len(sequence)*1.2, 20))
    scores_dict = {}
    valid_mutant_set = set(scores.mutant)
    if mutation_range_start is None: mutation_range_start = 1
    if mutation_range_end is None: mutation_range_end = len(sequence)
    for target_AA in list(AA_vocab):
        for position in range(mutation_range_start, mutation_range_end+1):
            mutant = sequence[position-1]+str(position)+target_AA
            if mutant in valid_mutant_set:
                scores_dict[mutant] = float(scores.loc[scores.mutant==mutant, 'avg_score'])
            else:
                scores_dict[mutant] = 0.0
    labels = (np.asarray(["{} \n {:.4f}".format(symb, value) for symb, value in scores_dict.items()])).reshape(len(AA_vocab), mutation_range_end-mutation_range_start+1)
    heat = sns.heatmap(piv, annot=labels, fmt="", cmap='RdYlGn', linewidths=0.30, vmin=np.percentile(scores.avg_score, 2), vmax=np.percentile(scores.avg_score, 98),\
                       cbar_kws={'label': 'Log likelihood ratio (mutant / starting sequence)'})
    heat.figure.axes[-1].yaxis.label.set_size(20)
    #heat.set_title("Fitness scores for all single amino acid substitutions", fontsize=30)
    heat.set_title("Higher predicted scores (green) imply higher protein fitness", fontsize=30, pad=40)
    heat.set_xlabel("Sequence position", fontsize=20)
    heat.set_ylabel("Amino Acid mutation", fontsize=20)
    plt.savefig('fitness_scoring_substitution_matrix.png')
    return plt

def suggest_mutations(scores):
    """Summarize the top-scoring single mutants and positions as a text recommendation."""
    intro_message = "The following mutations may be sensible options to improve fitness: \n\n"
    # Best mutants
    top_mutants = list(scores.sort_values(by=['avg_score'], ascending=False).head(5).mutant)
    mutant_recos = "The 5 single mutants with highest predicted fitness are:\n {} \n\n".format(", ".join(top_mutants))
    # Best positions
    positive_scores = scores[scores.avg_score > 0]
    positive_scores_position_avg = positive_scores.groupby(['position']).mean()
    top_positions = list(positive_scores_position_avg.sort_values(by=['avg_score'], ascending=False).head(5).index.astype(str))
    print(top_positions)
    position_recos = "The 5 positions with the highest average fitness increase are:\n {}".format(", ".join(top_positions))
    return intro_message + mutant_recos + position_recos

def get_mutated_protein(sequence, mutant):
    """Apply a single mutant (e.g. 'M1A') to the starting sequence."""
    mutated_sequence = list(sequence)
    mutated_sequence[int(mutant[1:-1])-1] = mutant[-1]
    return ''.join(mutated_sequence)

def score_and_create_matrix_all_singles(sequence, mutation_range_start=None, mutation_range_end=None, model_type="Small", scoring_mirror=False, batch_size_inference=20, num_workers=0, AA_vocab=AA_vocab):
    if model_type == "Small":
        model = tranception.model_pytorch.TranceptionLMHeadModel.from_pretrained(pretrained_model_name_or_path="PascalNotin/Tranception_Small", use_auth_token=True)
    elif model_type == "Medium":
        model = tranception.model_pytorch.TranceptionLMHeadModel.from_pretrained(pretrained_model_name_or_path="PascalNotin/Tranception_Medium", use_auth_token=True)
    elif model_type == "Large":
        model = tranception.model_pytorch.TranceptionLMHeadModel.from_pretrained(pretrained_model_name_or_path="PascalNotin/Tranception_Large", use_auth_token=True)
    model.config.tokenizer = tokenizer
    all_single_mutants = create_all_single_mutants(sequence, AA_vocab, mutation_range_start, mutation_range_end)
    scores = model.score_mutants(DMS_data=all_single_mutants,
                                 target_seq=sequence,
                                 scoring_mirror=scoring_mirror,
                                 batch_size_inference=batch_size_inference,
                                 num_workers=num_workers,
                                 indel_mode=False
                                 )
    scores = pd.merge(scores, all_single_mutants, on="mutated_sequence", how="left")
    scores["position"] = scores["mutant"].map(lambda x: int(x[1:-1]))
    scores["target_AA"] = scores["mutant"].map(lambda x: x[-1])
    score_heatmap = create_scoring_matrix_visual(scores, sequence, AA_vocab, mutation_range_start, mutation_range_end)
    return score_heatmap, suggest_mutations(scores)

#######################################################################################################################################
############################################### GRADIO INTERFACE #####################################################################
#######################################################################################################################################

title = "Interactive in silico directed evolution with Tranception"
description = "Perform in silico directed evolution with Tranception to iteratively improve the fitness of a starting protein sequence one mutation at a time. At each step, the Tranception model computes the log likelihood ratios of all possible single amino acid substitutions vs. the starting sequence, and outputs a fitness heatmap and recommendations to guide the selection of the mutation to apply. Note: the current version does not leverage homolog retrieval at inference time to boost fitness prediction performance."
article = "<p style='text-align: center'><a href='https://proceedings.mlr.press/v162/notin22a.html' target='_blank'>Tranception: Protein Fitness Prediction with Autoregressive Transformers and Inference-time Retrieval</a></p>"
examples = [
    ['A4_HUMAN: MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMNVQNGKWDSDPSGTKTCIDTKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPHFVIPYRCLVGEFVSDALLVPDKCKFLHQERMDVCETHLHWHTVAKETCSEKSTNLHDYGMLLPCGIDKFRGVEFVCCPLAEESDNVDSADAEEDDSDVWWGGADTDYADGSEDKVVEVAEEEEVAEVEEEEADDDEDDEDGDEVEEEAEEPYEEATERTTSIATTTTTTTESVEEVVREVCSEQAETGPCRAMISRWYFDVTEGKCAPFFYGGCGGNRNNFDTEEYCMAVCGSAMSQSLLKTTQEPLARDPVKLPTTAASTPDAVDKYLETPGDENEHAHFQKAKERLEAKHRERMSQVMREWEEAERQAKNLPKADKKAVIQHFQEKVESLEQEAANERQQLVETHMARVEAMLNDRRRLALENYITALQAVPPRPRHVFNMLKKYVRAEQKDRQHTLKHFEHVRMVDPKKAAQIRSQVMTHLRVIYERMNQSLSLLYNVPAVAEEIQDEVDELLQKEQNYSDDVLANMISEPRISYGNDALMPSLTETKTTVELLPVNGEFSLDDLQPWHSFGADSVPANTENEVEPVDARPAADRGLTTRPGSGLTNIKTEEISEVKMDAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIATVIVITLVMLKKKQYTSIHHGVVEVDAAVTPEERHLSKMQQNGYENPTYKFFEQMQN'],
    ['ADRB2_HUMAN: MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYANETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLQKIDKSEGRFHVQNLSQVEQDGRTGHGLRRSSKFCLKEHKALKTLGIIMGTFTLCWLPFFIVNIVHVIQDNLIRKEVYILLNWIGYVNSGFNPLIYCRSPDFRIAFQELLCLRRSSLKAYGNGYSSNGNTGEQSGYHVEQEKENKLLCEDLPGTEDFVGHQGTVPSDNIDSQGRNCSTNDSLL'],
    ['AMIE_PSEAE: MRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
    ['P53_HUMAN: MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPRVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD']
]

model_size_selection = gr.Radio(label="Tranception model size", choices=["Small","Medium","Large"], value="Small")
protein_sequence_input = gr.Textbox(lines=1, label="Input protein sequence (see below for examples; default = RL40A_YEAST)", value="MQIFVKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGIIEPSLKALASKYNCDKSVCRKCYARLPPRATNCRKRKCGHTNQLRPKKKLK")
mutation_range_start = gr.Number(label="Start of mutation range (min value = 1)", value=1, precision=0)
mutation_range_end = gr.Number(label="End of mutation range (leave empty for full length)", value=10, precision=0)
scoring_mirror = gr.Checkbox(label="Score protein from both directions (leads to more robust fitness predictions, but doubles inference time)")

# Output components (TODO: make the plot scrollable)
output_plot = gr.Plot(label="Fitness scores for all single amino acid substitutions in mutation range")
output_recommendations = gr.Textbox(label="Mutation recommendations")

gr.Interface(
    fn=score_and_create_matrix_all_singles,
    inputs=[protein_sequence_input, mutation_range_start, mutation_range_end, model_size_selection, scoring_mirror],
    outputs=["plot", "text"],
    title=title,
    description=description,
    article=article,
    examples=examples,
    enable_queue=True,
    allow_flagging="never"
).launch(debug=True)
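For reference, the scoring helper above can also be driven outside of the Gradio UI. The sketch below is illustrative only and not part of the commit; it assumes the functions defined in app.py are importable and that the selected Tranception checkpoint can be downloaded. The sequence is a truncated fragment of the default RL40A_YEAST input shown above.

# Hypothetical programmatic use of the scoring helper defined above (not part of the commit).
fragment = "MQIFVKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIFAGKQLEDGR"  # truncated RL40A_YEAST fragment, for illustration
heatmap, recommendations = score_and_create_matrix_all_singles(
    fragment,
    mutation_range_start=1,
    mutation_range_end=10,
    model_type="Small",
    scoring_mirror=False,
)
print(recommendations)  # text summary of top single mutants and positions; heatmap is the matplotlib handle returned by create_scoring_matrix_visual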
requirements.txt
ADDED
@@ -0,0 +1,4 @@
gradio
transformers==4.17
datasets==1.18.3
biopython==1.78
tranception/__init__.py
ADDED
@@ -0,0 +1 @@
from . import config
tranception/activations.py
ADDED
@@ -0,0 +1,114 @@
import math

import torch
from packaging import version
from torch import nn

from transformers.utils import logging


logger = logging.get_logger(__name__)


def _gelu_python(x):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


if version.parse(torch.__version__) < version.parse("1.4"):
    gelu = _gelu_python
else:
    gelu = nn.functional.gelu


def gelu_fast(x):
    return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))


def quick_gelu(x):
    return x * torch.sigmoid(1.702 * x)


def _silu_python(x):
    """
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
    later.
    """
    return x * torch.sigmoid(x)


if version.parse(torch.__version__) < version.parse("1.7"):
    silu = _silu_python
else:
    silu = nn.functional.silu


def _mish_python(x):
    """
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
    visit the official repository for the paper: https://github.com/digantamisra98/Mish
    """
    return x * torch.tanh(nn.functional.softplus(x))


if version.parse(torch.__version__) < version.parse("1.9"):
    mish = _mish_python
else:
    mish = nn.functional.mish


def linear_act(x):
    return x

def squared_relu(x):
    """
    Squared ReLU variant that is fastest with Pytorch.
    """
    x = nn.functional.relu(x)
    return x*x

def squared_relu_xla(x):
    """
    Squared ReLU variant that is fastest with JAX.
    """
    x = nn.functional.relu(x)
    return x**2

tranception_ACT2FN = {
    "relu": nn.functional.relu,
    "silu": silu,
    "swish": silu,
    "gelu": gelu,
    "tanh": torch.tanh,
    "gelu_new": gelu_new,
    "gelu_fast": gelu_fast,
    "quick_gelu": quick_gelu,
    "mish": mish,
    "linear": linear_act,
    "sigmoid": torch.sigmoid,
    "squared_relu": squared_relu,
    "squared_relu_xla": squared_relu_xla,
}


def get_activation(activation_string):
    if activation_string in tranception_ACT2FN:
        return tranception_ACT2FN[activation_string]
    else:
        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(tranception_ACT2FN.keys())}")
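As a quick usage note (not part of the commit), the registry above can be queried by name through get_activation; a minimal sketch:

import torch
from tranception.activations import get_activation

act = get_activation("squared_relu")
x = torch.tensor([-1.0, 0.0, 2.0])
print(act(x))  # tensor([0., 0., 4.]): ReLU is applied first, then the result is squared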
tranception/config.py
ADDED
@@ -0,0 +1,36 @@
from transformers import GPT2Config

class TranceptionConfig(GPT2Config):
    """
    Config subclass for Tranception model architecture.
    """
    def __init__(
        self,
        attention_mode="tranception",
        position_embedding="grouped_alibi",
        tokenizer=None,
        retrieval_aggregation_mode=None,
        retrieval_inference_weight=0.6,
        MSA_filename=None,
        MSA_weight_file_name=None,
        MSA_start=None,
        MSA_end=None,
        full_protein_length=None,
        clustal_omega_location=None,
        scoring_window=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.model_type="tranception"
        self.attention_mode=attention_mode
        self.position_embedding=position_embedding
        self.tokenizer = tokenizer
        self.retrieval_aggregation_mode = retrieval_aggregation_mode
        self.retrieval_inference_weight = retrieval_inference_weight
        self.MSA_filename = MSA_filename
        self.MSA_weight_file_name = MSA_weight_file_name
        self.MSA_start=MSA_start
        self.MSA_end=MSA_end
        self.full_protein_length = full_protein_length
        self.clustal_omega_location = clustal_omega_location
        self.scoring_window=scoring_window
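For illustration (not part of the commit), a TranceptionConfig might be constructed as follows; the specific field values here are placeholders, and any standard GPT2Config arguments (e.g. vocab_size, n_embd, n_head) are forwarded through **kwargs:

from tranception.config import TranceptionConfig

config = TranceptionConfig(
    attention_mode="tranception",
    position_embedding="grouped_alibi",
    retrieval_aggregation_mode=None,  # autoregressive-only inference (no MSA retrieval)
    vocab_size=25,                    # placeholder value forwarded to GPT2Config
)
print(config.model_type, config.position_embedding)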
tranception/model_pytorch.py
ADDED
@@ -0,0 +1,930 @@
from dataclasses import dataclass
from typing import Optional, Tuple
import math
import os
import pandas as pd

import torch
from torch import nn
from torch.nn import CrossEntropyLoss, NLLLoss
import torch.nn.functional as F
from transformers import GPT2PreTrainedModel

from transformers.modeling_utils import (
    Conv1D,
    PreTrainedModel,
    SequenceSummary,
    find_pruneable_heads_and_indices,
    prune_conv1d_layer,
)
from transformers.file_utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput
)
from transformers.utils import logging
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map

from tranception.activations import tranception_ACT2FN
from tranception.config import TranceptionConfig
from tranception.outputs import (
    TranceptionCausalLMOutputWithCrossAttentions,
)
from tranception.utils import msa_utils
from tranception.utils import scoring_utils

logger = logging.get_logger(__name__)

def nanmean(v, *args, inplace=False, **kwargs):
    if not inplace:
        v = v.clone()
    is_nan = torch.isnan(v)
    v[is_nan] = 0
    return v.sum(*args, **kwargs) / (~is_nan).float().sum(*args, **kwargs)

def get_slopes(n, mode="standard_alibi", verbose=False):
    """
    Function to compute the m constant for each attention head. Code has been adapted from the official ALiBi codebase at:
    https://github.com/ofirpress/attention_with_linear_biases/blob/master/fairseq/models/transformer.py
    """
    def get_slopes_power_of_2(n):
        start = (2**(-2**-(math.log2(n)-3)))
        ratio = start
        return [start*ratio**i for i in range(n)]
    if mode=="grouped_alibi":
        n = n // 4
    if math.log2(n).is_integer():
        result = get_slopes_power_of_2(n)
    else:
        #Workaround when the number of heads is not a power of 2
        closest_power_of_2 = 2**math.floor(math.log2(n))
        result = get_slopes_power_of_2(closest_power_of_2) + get_slopes(2*closest_power_of_2)[0::2][:n-closest_power_of_2]
    if mode=="grouped_alibi":
        result = result * 4
        if verbose:
            print("ALiBi slopes: {}".format(result))
    return result

class SpatialDepthWiseConvolution(nn.Module):
    def __init__(self, head_dim: int, kernel_size: int = 3):
        super().__init__()
        self.kernel_size = kernel_size
        self.conv = nn.Conv1d(in_channels=head_dim, out_channels=head_dim, kernel_size=(kernel_size,), padding=(kernel_size - 1,), groups=head_dim)

    def forward(self, x: torch.Tensor):
        batch_size, heads, seq_len, head_dim = x.shape
        x = x.permute(0, 1, 3, 2).contiguous()
        x = x.view(batch_size * heads, head_dim, seq_len)
        x = self.conv(x)
        if self.kernel_size>1:
            x = x[:, :, :-(self.kernel_size - 1)]
        x = x.view(batch_size, heads, head_dim, seq_len)
        x = x.permute(0, 1, 3, 2)
        return x

class TranceptionBlockAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False, SDWC_kernel_size=None):
        super().__init__()

        max_positions = config.max_position_embeddings
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view(
                1, 1, max_positions, max_positions
            ),
        )
        self.register_buffer("masked_bias", torch.tensor(-1e4))

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.split_size = self.embed_dim
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
            )

        self.scale_attn_weights = config.scale_attn_weights
        self.is_cross_attention = is_cross_attention

        if self.is_cross_attention:
            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
        else:
            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

        self.pruned_heads = set()

        self.attention_mode=config.attention_mode

        if self.attention_mode=="tranception":
            assert self.num_heads%4==0, "Invalid number of heads. Tranception requires the number of heads to be a multiple of 4."
            self.num_heads_per_kernel_size = self.num_heads // 4
            self.query_depthwiseconv = nn.ModuleDict()
            self.key_depthwiseconv = nn.ModuleDict()
            self.value_depthwiseconv = nn.ModuleDict()
            for kernel_idx, kernel in enumerate([3,5,7]):
                self.query_depthwiseconv[str(kernel_idx)] = SpatialDepthWiseConvolution(self.head_dim,kernel)
                self.key_depthwiseconv[str(kernel_idx)] = SpatialDepthWiseConvolution(self.head_dim,kernel)
                self.value_depthwiseconv[str(kernel_idx)] = SpatialDepthWiseConvolution(self.head_dim,kernel)

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])

        # Prune conv1d layers
        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)

        # Update hyper params
        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
        self.num_heads = self.num_heads - len(heads)
        self.pruned_heads = self.pruned_heads.union(heads)

    def _attn(self, query, key, value, attention_mask=None, head_mask=None, alibi_bias=None):
        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        if self.scale_attn_weights:
            attn_weights = attn_weights / (float(value.size(-1)) ** 0.5)

        if not self.is_cross_attention:
            # if only "normal" attention layer implements causal mask
            query_length, key_length = query.size(-2), key.size(-2)
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool()
            attn_weights = torch.where(causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype))

        if alibi_bias is not None:
            attn_weights = attn_weights + alibi_bias[:,:,:attn_weights.size(-1)]

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.Softmax(dim=-1)(attn_weights)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(*new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
        return tensor.view(new_shape)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=False,
        output_attentions=False,
        alibi_bias=None,
    ):
        if encoder_hidden_states is not None:
            if not hasattr(self, "q_attn"):
                raise ValueError(
                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
                    "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
                )

            query = self.q_attn(hidden_states)
            key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
            attention_mask = encoder_attention_mask
        else:
            query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)

        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)

        if layer_past is not None:
            past_key, past_value = layer_past
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            present = (key, value)
        else:
            present = None

        if self.attention_mode=="tranception":
            # We do not do anything on the first self.num_heads_per_kernel_size heads (kernel =1)
            query_list=[query[:,:self.num_heads_per_kernel_size,:,:]]
            key_list=[key[:,:self.num_heads_per_kernel_size,:,:]]
            value_list=[value[:,:self.num_heads_per_kernel_size,:,:]]
            for kernel_idx in range(3):
                query_list.append(self.query_depthwiseconv[str(kernel_idx)](query[:,(kernel_idx+1)*self.num_heads_per_kernel_size:(kernel_idx+2)*self.num_heads_per_kernel_size,:,:]))
                key_list.append(self.key_depthwiseconv[str(kernel_idx)](key[:,(kernel_idx+1)*self.num_heads_per_kernel_size:(kernel_idx+2)*self.num_heads_per_kernel_size,:,:]))
                value_list.append(self.value_depthwiseconv[str(kernel_idx)](value[:,(kernel_idx+1)*self.num_heads_per_kernel_size:(kernel_idx+2)*self.num_heads_per_kernel_size,:,:]))
            query=torch.cat(query_list, dim=1)
            key=torch.cat(key_list, dim=1)
            value=torch.cat(value_list, dim=1)

        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask, alibi_bias=alibi_bias)

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.c_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)

class TranceptionBlockMLP(nn.Module):
    def __init__(self, intermediate_size, config):
        super().__init__()
        embed_dim = config.hidden_size
        self.c_fc = Conv1D(intermediate_size, embed_dim)
        self.c_proj = Conv1D(embed_dim, intermediate_size)
        self.act = tranception_ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states):
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states

class TranceptionBlock(nn.Module):
    def __init__(self, config, SDWC_kernel_size=None):
        super().__init__()
        hidden_size = config.hidden_size
        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size

        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = TranceptionBlockAttention(config, SDWC_kernel_size=SDWC_kernel_size)
        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        if config.add_cross_attention:
            self.crossattention = TranceptionBlockAttention(config, is_cross_attention=True, SDWC_kernel_size=SDWC_kernel_size)
            self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        self.mlp = TranceptionBlockMLP(inner_dim, config)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=False,
        output_attentions=False,
        alibi_bias=None,
    ):
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            alibi_bias=alibi_bias,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]
        # residual connection
        hidden_states = attn_output + residual

        if encoder_hidden_states is not None:
            # add one self-attention block for cross-attention
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
                    "cross-attention layers by setting `config.add_cross_attention=True`"
                )
            residual = hidden_states
            hidden_states = self.ln_cross_attn(hidden_states)
            cross_attn_outputs = self.crossattention(
                hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )
            attn_output = cross_attn_outputs[0]
            # residual connection
            hidden_states = residual + attn_output
            outputs = outputs + cross_attn_outputs[2:]  # add cross attentions if we output attention weights

        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)

        feed_forward_hidden_states = self.mlp(hidden_states)

        # residual connection
        hidden_states = residual + feed_forward_hidden_states

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions, cross_attentions)

class TranceptionModel(GPT2PreTrainedModel):
    _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.hidden_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.position_embedding = config.position_embedding if hasattr(config, "position_embedding") else "learned"
        if self.position_embedding=="learned":
            self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
            self.alibi = None
        elif self.position_embedding=="grouped_alibi":
            maxpos = config.n_positions
            attn_heads = config.n_head
            self.slopes = torch.Tensor(get_slopes(attn_heads, mode=self.position_embedding))
            #The softmax operation is invariant to translation, and bias functions used are always linear.
            alibi = self.slopes.unsqueeze(1).unsqueeze(1) * torch.arange(maxpos).unsqueeze(0).unsqueeze(0).expand(attn_heads, -1, -1)
            alibi = alibi.view(attn_heads, 1, maxpos)
            self.register_buffer('alibi',alibi)

        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([TranceptionBlock(config) for _ in range(config.num_hidden_layers)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        self.init_weights()

        # Model parallel
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False

    def parallelize(self, device_map=None, num_cores=None):
        self.device_map = (
            get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map
        )
        device_prefix="cuda:"
        assert_device_map(self.device_map, len(self.h))
        self.model_parallel = True
        self.first_device = "cpu" if "cpu" in self.device_map.keys() else device_prefix + str(min(self.device_map.keys()))
        self.last_device = device_prefix + str(max(self.device_map.keys()))
        self.wte = self.wte.to(self.first_device)
        if self.position_embedding=="learned":
            self.wpe = self.wpe.to(self.first_device)
        for k, v in self.device_map.items():
            print("k,v :"+str(k)+","+str(v))
            for block in v:
                cuda_device = device_prefix + str(k)
                self.h[block] = self.h[block].to(cuda_device)
        self.ln_f = self.ln_f.to(self.last_device)

    def deparallelize(self):
        self.model_parallel = False
        self.device_map = None
        self.first_device = "cpu"
        self.last_device = "cpu"
        self.wte = self.wte.to("cpu")
        if self.position_embedding=="learned":
            self.wpe = self.wpe.to("cpu")
        for index in range(len(self.h)):
            self.h[index] = self.h[index].to("cpu")
        self.ln_f = self.ln_f.to("cpu")
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.h[layer].attn.prune_heads(heads)

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1])

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.h))
        else:
            past_length = past_key_values[0][0].size(-2)
        if position_ids is None:
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

        # GPT2Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * -10000.0

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.add_cross_attention and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # head_mask has shape n_layer x batch x n_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        if self.position_embedding=="learned":
            position_embeds = self.wpe(position_ids)
            hidden_states = inputs_embeds + position_embeds
        else:
            hidden_states = inputs_embeds

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_hidden_states = () if output_hidden_states else None

        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            # Model parallel
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
                # Ensure layer_past is on same device as hidden_states (might not be correct)
                if layer_past is not None:
                    layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past)
                # Ensure that attention_mask is always on the same device as hidden_states
                if attention_mask is not None:
                    attention_mask = attention_mask.to(hidden_states.device)
                if isinstance(head_mask, torch.Tensor):
                    head_mask = head_mask.to(hidden_states.device)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, use_cache, output_attentions)

                    return custom_forward

                outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    None,
                    attention_mask,
                    head_mask[i],
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                outputs = block(
                    hidden_states,
                    layer_past=layer_past,
                    attention_mask=attention_mask,
                    head_mask=head_mask[i],
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    alibi_bias=self.alibi if hasattr(self, "alibi") else None
                )

            hidden_states = outputs[0]

            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],)

            if self.model_parallel:
                device_prefix="cuda:"
                for k, v in self.device_map.items():
                    if i == v[-1] and device_prefix + str(k) != self.last_device:
                        hidden_states = hidden_states.to(device_prefix + str(k + 1))

        hidden_states = self.ln_f(hidden_states)

        hidden_states = hidden_states.view(*output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )

class TranceptionLMHeadModel(GPT2PreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"]
    def __init__(self, config):
        super().__init__(config)
        self.transformer = TranceptionModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.config = config

        self.init_weights()

        self.default_model_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Model parallel
        self.model_parallel = False
        self.device_map = None

        self.retrieval_aggregation_mode = config.retrieval_aggregation_mode if hasattr(config, "retrieval_aggregation_mode") else None
        if self.retrieval_aggregation_mode is not None:
            print("Model leverages both autoregressive and retrieval inference")
            self.MSA_filename = config.MSA_filename if hasattr(config, "MSA_filename") else False
            self.MSA_folder = '/'.join(self.MSA_filename.split(os.sep)[:-1])
            self.MSA_name = self.MSA_filename.split(os.sep)[-1]
            self.retrieval_inference_weight_LR = config.retrieval_inference_weight if hasattr(config, "retrieval_inference_weight") else 0.6
            self.retrieval_inference_weight_RL = config.retrieval_inference_weight if hasattr(config, "retrieval_inference_weight") else 0.6
            self.MSA_start=config.MSA_start
            self.MSA_end=config.MSA_end
            self.full_protein_length = config.full_protein_length if hasattr(config, "full_protein_length") else -1

            self.MSA_log_prior = torch.log(torch.tensor(
                msa_utils.get_msa_prior(
                    MSA_data_file=self.MSA_filename,
                    MSA_weight_file_name=config.MSA_weight_file_name,
                    retrieval_aggregation_mode=self.retrieval_aggregation_mode,
                    MSA_start=self.MSA_start,
                    MSA_end=self.MSA_end,
                    len_target_seq=self.full_protein_length,
                    vocab=config.tokenizer.get_vocab(),
                    verbose=False
                )
            ).float().to(self.default_model_device))
        else:
            print("Model only uses autoregressive inference")

    def parallelize(self, device_map=None, num_cores=None, num_pipelines=1):
        self.num_pipelines=num_pipelines
        self.device_map = (
            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.transformer.h))
        self.transformer.parallelize(self.device_map, num_cores=num_cores)
        self.lm_head = self.lm_head.to(self.transformer.first_device)
        self.model_parallel = True

    def deparallelize(self):
        self.transformer.deparallelize()
        self.transformer = self.transformer.to("cpu")
        self.lm_head = self.lm_head.to("cpu")
        self.model_parallel = False
        torch.cuda.empty_cache()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        token_type_ids = kwargs.get("token_type_ids", None)
        # only last token for inputs_ids if past is defined in kwargs
        if past:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            position_ids = None

        return {
            "input_ids": input_ids,
            "past_key_values": past,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "flip": kwargs.get("flip", None),
        }

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        flip=None,
        start_slice=None,
        end_slice=None,
        mutated_sequence=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
            ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )
        hidden_states = transformer_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.transformer.first_device)
            hidden_states = hidden_states.to(self.lm_head.weight.device)
            self.MSA_log_prior = self.MSA_log_prior.to(self.lm_head.weight.device)

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            if self.retrieval_aggregation_mode is not None:
                batch_size = input_ids.size(0)

                if self.retrieval_aggregation_mode=="aggregate_indel":
                    assert batch_size==1, "Aggregate indel is only supported for batch size of 1"
                    truncated_sequence_text = mutated_sequence[0][start_slice[0]:end_slice[0]]
                    if len(truncated_sequence_text)!=shift_logits.shape[1]-1:  # shift_logits only has one extra token compared to truncated_sequence_text (the BOS token)
                        print("Tokenization error -- seq length: {} and shift_logits length - 1 : {}".format(len(mutated_sequence),shift_logits.shape[1]-1))
                    MSA_log_prior, MSA_start, MSA_end = msa_utils.update_retrieved_MSA_log_prior_indel(self, self.MSA_log_prior, self.MSA_start, self.MSA_end, mutated_sequence[0])

                elif self.retrieval_aggregation_mode=="aggregate_substitution":
                    MSA_log_prior=self.MSA_log_prior
                    MSA_start=self.MSA_start
                    MSA_end=self.MSA_end

                shift_log_probas = torch.log_softmax(shift_logits,dim=-1)
                fused_shift_log_probas = shift_log_probas.clone()
                if flip is None:
                    flip = torch.zeros(batch_size).to(fused_shift_log_probas.device)
                flip = flip > 0

                for seq_index in range(batch_size):
                    min_prior_slice = max(start_slice[seq_index], MSA_start)
                    max_prior_slice = min(end_slice[seq_index], MSA_end)

                    if max_prior_slice <= min_prior_slice:
                        print("Non overlapping region detected: min_prior_slice {} and max_prior_slice {}".format(min_prior_slice,max_prior_slice))
                        continue

                    slice_prior = MSA_log_prior[min_prior_slice:max_prior_slice,:].to(fused_shift_log_probas.device)
                    if flip[seq_index]:
                        slice_prior = torch.flip(slice_prior,dims=(0,))
                        min_logits_slice = max(0,end_slice[seq_index]-MSA_end)
                        max_logits_slice = min_logits_slice + (max_prior_slice-min_prior_slice)
                        fused_shift_log_probas[seq_index,min_logits_slice:max_logits_slice,:] = (1-self.retrieval_inference_weight_RL)*shift_log_probas[seq_index,min_logits_slice:max_logits_slice,:] + self.retrieval_inference_weight_RL*slice_prior
|
828 |
+
else:
|
829 |
+
min_logits_slice = max(0, MSA_start-start_slice[seq_index])
|
830 |
+
max_logits_slice = min_logits_slice + (max_prior_slice-min_prior_slice)
|
831 |
+
fused_shift_log_probas[seq_index,min_logits_slice:max_logits_slice,:] = (1-self.retrieval_inference_weight_LR)*shift_log_probas[seq_index,min_logits_slice:max_logits_slice,:] + self.retrieval_inference_weight_LR*slice_prior
|
832 |
+
|
833 |
+
if self.retrieval_aggregation_mode=="aggregate_indel":
|
834 |
+
try:
|
835 |
+
# If a given residue colume is an added zero-column, then we overwrite prior fusion and only predict based on the autoregressive transformer inference mode.
|
836 |
+
inserted_retrieval_positions = [True if slice_prior[i].sum()==0 else False for i in range(len(slice_prior))]+[True] #Last True is for the end of sentence token
|
837 |
+
fused_shift_log_probas[:,inserted_retrieval_positions,:]=shift_log_probas[:,inserted_retrieval_positions,:]
|
838 |
+
except:
|
839 |
+
print("Error when adding zero column(s) to account for insertion mutations.")
|
840 |
+
|
841 |
+
loss_fct = NLLLoss(reduction='none')
|
842 |
+
loss = loss_fct(input=fused_shift_log_probas.view(-1, fused_shift_log_probas.size(-1)), target=shift_labels.view(-1)).view(fused_shift_log_probas.shape[0],fused_shift_log_probas.shape[1])
|
843 |
+
mask = attention_mask[..., 1:].float()
|
844 |
+
mask[mask==0]=float('nan')
|
845 |
+
loss *= mask
|
846 |
+
loss = nanmean(loss, dim=1).mean()
|
847 |
+
else:
|
848 |
+
loss_fct = CrossEntropyLoss()
|
849 |
+
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
850 |
+
fused_shift_log_probas = None
|
851 |
+
|
852 |
+
if not return_dict:
|
853 |
+
output = (lm_logits,) + transformer_outputs[1:]
|
854 |
+
return ((loss,) + output) if loss is not None else output
|
855 |
+
|
856 |
+
return TranceptionCausalLMOutputWithCrossAttentions(
|
857 |
+
loss=loss,
|
858 |
+
logits=lm_logits,
|
859 |
+
past_key_values=transformer_outputs.past_key_values,
|
860 |
+
hidden_states=transformer_outputs.hidden_states,
|
861 |
+
attentions=transformer_outputs.attentions,
|
862 |
+
cross_attentions=transformer_outputs.cross_attentions,
|
863 |
+
fused_shift_log_probas=fused_shift_log_probas
|
864 |
+
)
|
865 |
+
|
866 |
+
|
867 |
+
@staticmethod
|
868 |
+
def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
|
869 |
+
"""
|
870 |
+
This function is used to re-order the :obj:`past_key_values` cache if
|
871 |
+
:meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
|
872 |
+
called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
|
873 |
+
"""
|
874 |
+
return tuple(
|
875 |
+
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
|
876 |
+
for layer_past in past
|
877 |
+
)
|
878 |
+
|
879 |
+
def score_mutants(self, DMS_data, target_seq=None, scoring_mirror=True, batch_size_inference=10, num_workers=10, indel_mode=False):
|
880 |
+
"""
|
881 |
+
Method to score mutants in an input DMS file.
|
882 |
+
DMS_data: (dataframe) Dataframe containing the list of mutated sequences for scoring.
|
883 |
+
target_seq: (string) Full reference sequence (wild type) that is mutated in the DMS assay. If not None, returned scores are delta log likelihood wrt that sequence.
|
884 |
+
scoring_mirror: (bool) Whether to score mutated sequences from both directions (Left->Right and Right->Left).
|
885 |
+
batch_size_inference: (int) Batch size for scoring.
|
886 |
+
num_workers: (int) Number of workers to be used in the data loader.
|
887 |
+
indel_mode: (bool) Flag to be used when scoring insertions and deletions. Otherwise assumes substitutions.
|
888 |
+
"""
|
889 |
+
df = DMS_data.copy()
|
890 |
+
if ('mutated_sequence' not in df) and (not indel_mode): df['mutated_sequence'] = df['mutant'].apply(lambda x: scoring_utils.get_mutated_sequence(target_seq, x))
|
891 |
+
assert ('mutated_sequence' in df), "DMS file to score does not have mutated_sequence column"
|
892 |
+
#if 'mutant' not in df: df['mutant'] = df['mutated_sequence'] #if mutant not in DMS file we default to mutated_sequence
|
893 |
+
if 'DMS_score' in df: del df['DMS_score']
|
894 |
+
if 'DMS_score_bin' in df: del df['DMS_score_bin']
|
895 |
+
if target_seq is not None:
|
896 |
+
df_left_to_right_slices = scoring_utils.get_sequence_slices(df, target_seq=target_seq, model_context_len = self.config.n_ctx - 2, indel_mode=indel_mode, scoring_window=self.config.scoring_window)
|
897 |
+
else:
|
898 |
+
df_left_to_right_slices = scoring_utils.get_sequence_slices(df, target_seq=list(df['mutated_sequence'])[0], model_context_len = self.config.n_ctx - 2, indel_mode=indel_mode, scoring_window='sliding')
|
899 |
+
print("Scoring sequences from left to right")
|
900 |
+
scores_L_to_R = scoring_utils.get_tranception_scores_mutated_sequences(model=self, mutated_sequence_df=df_left_to_right_slices, batch_size_inference=batch_size_inference, score_var_name='avg_score_L_to_R', target_seq=target_seq, num_workers=num_workers, indel_mode=indel_mode)
|
901 |
+
if scoring_mirror:
|
902 |
+
print("Scoring sequences from right to left")
|
903 |
+
df_right_to_left_slices = df_left_to_right_slices.copy()
|
904 |
+
df_right_to_left_slices['sliced_mutated_sequence'] = df_right_to_left_slices['sliced_mutated_sequence'].apply(lambda x: x[::-1])
|
905 |
+
scores_R_to_L = scoring_utils.get_tranception_scores_mutated_sequences(model=self, mutated_sequence_df=df_right_to_left_slices, batch_size_inference=batch_size_inference, score_var_name='avg_score_R_to_L', target_seq=target_seq, num_workers=num_workers, reverse=True, indel_mode=indel_mode)
|
906 |
+
all_scores = pd.merge(scores_L_to_R, scores_R_to_L, on='mutated_sequence', how='left', suffixes=('','_R_to_L'))
|
907 |
+
all_scores['avg_score'] = (all_scores['avg_score_L_to_R'] + all_scores['avg_score_R_to_L']) / 2.0
|
908 |
+
else:
|
909 |
+
all_scores = scores_L_to_R
|
910 |
+
all_scores['avg_score'] = all_scores['avg_score_L_to_R']
|
911 |
+
#By design "get_tranception_scores_mutated_sequences" drops the WT from the output. We add it back if that was one of the sequences to score in the DMS (score=0 by definition)
|
912 |
+
if target_seq in DMS_data.mutated_sequence.values:
|
913 |
+
print("LEMON")
|
914 |
+
if scoring_mirror:
|
915 |
+
wt_row = pd.DataFrame([[target_seq,0,0,0]], columns=['mutated_sequence','avg_score_L_to_R','avg_score_R_to_L','avg_score'])
|
916 |
+
else:
|
917 |
+
wt_row = pd.DataFrame([[target_seq,0,0]], columns=['mutated_sequence','avg_score_L_to_R','avg_score'])
|
918 |
+
all_scores = pd.concat([all_scores,wt_row], ignore_index=True)
|
919 |
+
return all_scores
|
920 |
+
|
921 |
+
def encode_batch(self, protein_sequence, sequence_name="sliced_mutated_sequence"):
|
922 |
+
"""
|
923 |
+
Method to process an input AA sequence batch (protein_sequence) and return a tokenized sequence (via the tokenizer associated to the model).
|
924 |
+
"""
|
925 |
+
protein_sequence[sequence_name] = scoring_utils.sequence_replace(sequences=protein_sequence[sequence_name], char_to_replace='X', char_replacements='ACDEFGHIKLMNPQRSTVWY')
|
926 |
+
protein_sequence[sequence_name] = scoring_utils.sequence_replace(sequences=protein_sequence[sequence_name], char_to_replace='B', char_replacements='DN')
|
927 |
+
protein_sequence[sequence_name] = scoring_utils.sequence_replace(sequences=protein_sequence[sequence_name], char_to_replace='J', char_replacements='IL')
|
928 |
+
protein_sequence[sequence_name] = scoring_utils.sequence_replace(sequences=protein_sequence[sequence_name], char_to_replace='Z', char_replacements='EQ')
|
929 |
+
return self.config.tokenizer(list(protein_sequence[sequence_name]), add_special_tokens=True, truncation=True, padding=True, max_length=self.config.n_ctx)
|
930 |
+
|
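The core of the retrieval-augmented forward pass above is a position-wise interpolation in log-probability space between the autoregressive head and the MSA-derived prior. A minimal sketch of that arithmetic on dummy tensors, where the 0.6 weight and the toy shapes are illustrative assumptions rather than values taken from this commit:

import torch

# Hypothetical shapes: batch of 1, 5 residue positions, vocab of 25 tokens
shift_log_probas = torch.log_softmax(torch.randn(1, 5, 25), dim=-1)   # autoregressive log-probas
MSA_log_prior = torch.log_softmax(torch.randn(5, 25), dim=-1)         # retrieved prior for the same window
w = 0.6                                                                # illustrative retrieval weight
# Same interpolation as applied to the overlapping window in forward()
fused = (1 - w) * shift_log_probas + w * MSA_log_prior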
tranception/outputs.py
ADDED
@@ -0,0 +1,48 @@
from dataclasses import dataclass
from typing import Optional, Tuple

import torch

from transformers.file_utils import ModelOutput

@dataclass
class TranceptionCausalLMOutputWithCrossAttentions(ModelOutput):
    """
    Class for Tranception causal language model (or autoregressive) outputs.
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Cross attentions weights after the attention softmax, used to compute the weighted average in the
            cross-attention heads.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `torch.FloatTensor` tuples of length `config.n_layers`, with each tuple containing the cached key,
            value states of the self-attention and the cross-attention layers if model is used in encoder-decoder
            setting. Only relevant if `config.is_decoder = True`.
            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        fused_shift_log_probas (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*, returned when `config.retrieval_aggregation_mode` is not None):
            Log probas for each residue position after aggregating autoregressive logits and retrieval logits.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    fused_shift_log_probas: Optional[torch.FloatTensor] = None
tranception/utils/__init__.py
ADDED
@@ -0,0 +1 @@
from . import scoring_utils, msa_utils
tranception/utils/dms_utils.py
ADDED
@@ -0,0 +1,30 @@
import pandas as pd
import numpy as np
from tranception.utils import scoring_utils

def DMS_file_cleanup(DMS_filename, target_seq, start_idx=1, end_idx=None, DMS_mutant_column='mutant', DMS_phenotype_name='score', DMS_directionality=1, AA_vocab = "ACDEFGHIKLMNPQRSTVWY"):
    """
    Function to process the raw substitution DMS assay data (e.g., removing invalid mutants, aggregating silent mutations).
    """
    DMS_data = pd.read_csv(DMS_filename, low_memory=False)
    end_idx = start_idx + len(target_seq) - 1 if end_idx is None else end_idx
    DMS_data['mutant'] = DMS_data[DMS_mutant_column]

    DMS_data=DMS_data[DMS_data['mutant'].notnull()].copy()
    DMS_data=DMS_data[DMS_data['mutant'].apply(lambda x: all([len(y)>=3 for y in x.split(":")]))].copy() #Mutant triplets should have at least 3 characters
    DMS_data=DMS_data[DMS_data['mutant'].apply(lambda x: all([(y[0] in AA_vocab) and (y[1:-1].isnumeric()) and (y[-1] in AA_vocab) for y in x.split(":")]))].copy()
    DMS_data=DMS_data[DMS_data['mutant'].apply(lambda x: all([int(y[1:-1])-start_idx >=0 and int(y[1:-1]) <= end_idx for y in x.split(":")]))].copy()
    DMS_data=DMS_data[DMS_data['mutant'].apply(lambda x: all([y[0]==target_seq[int(y[1:-1])-start_idx] for y in x.split(":")]))].copy()

    DMS_data[DMS_phenotype_name]=pd.to_numeric(DMS_data[DMS_phenotype_name],errors='coerce')
    DMS_data=DMS_data[np.isfinite(DMS_data[DMS_phenotype_name])]
    DMS_data.dropna(subset = [DMS_phenotype_name], inplace=True)
    DMS_data['DMS_score'] = DMS_data[DMS_phenotype_name] * DMS_directionality
    DMS_data=DMS_data[['mutant','DMS_score']]
    DMS_data=DMS_data.groupby('mutant').mean().reset_index()

    DMS_data['mutated_sequence'] = DMS_data['mutant'].apply(lambda x: scoring_utils.get_mutated_sequence(target_seq, x))
    DMS_data=DMS_data[['mutant','mutated_sequence','DMS_score']]

    return DMS_data
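The cleanup function above keeps only mutants that parse as valid substitution triplets against the reference sequence, then averages duplicate measurements. A minimal sketch of its expected input and output, assuming the tranception package from this commit is importable; the toy sequence, mutants, and file name are made up for illustration:

import pandas as pd
from tranception.utils.dms_utils import DMS_file_cleanup

# Hypothetical toy assay: two valid single substitutions against a short reference sequence
target_seq = "MKTAYIAKQR"
pd.DataFrame({"mutant": ["M1A", "K2R"], "score": [0.8, -0.3]}).to_csv("toy_DMS.csv", index=False)
clean_DMS = DMS_file_cleanup("toy_DMS.csv", target_seq=target_seq)
# clean_DMS now has columns ['mutant', 'mutated_sequence', 'DMS_score']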
tranception/utils/msa_utils.py
ADDED
@@ -0,0 +1,361 @@
import numpy as np
import pandas as pd
from collections import defaultdict
import random
import os
import torch
from Bio.Align.Applications import ClustalOmegaCommandline

def filter_msa(msa_data, num_sequences_kept=3):
    """
    Helper function to filter an input MSA msa_data (obtained via process_msa_data) and keep only num_sequences_kept aligned sequences.
    If the MSA already has fewer sequences than num_sequences_kept, we keep the MSA as is.
    If filtering, we always keep the first sequence of the MSA (ie. the wild type) by default.
    Sampling is done without replacement.
    """
    if len(list(msa_data.keys())) <= num_sequences_kept:
        return msa_data
    filtered_msa = {}
    wt_name = next(iter(msa_data))
    filtered_msa[wt_name] = msa_data[wt_name]
    del msa_data[wt_name]
    sequence_names = list(msa_data.keys())
    sequence_names_sampled = random.sample(sequence_names,k=num_sequences_kept-1)
    for seq in sequence_names_sampled:
        filtered_msa[seq] = msa_data[seq]
    return filtered_msa

def process_msa_data(MSA_data_file):
    """
    Helper function that takes as input a path to a MSA file (expects a2m format) and returns a dict mapping sequence ID to the corresponding AA sequence.
    """
    msa_data = defaultdict(str)
    sequence_name = ""
    with open(MSA_data_file, "r") as msa_file:
        for i, line in enumerate(msa_file):
            line = line.rstrip()
            if line.startswith(">"):
                sequence_name = line
            else:
                msa_data[sequence_name] += line.upper()
    return msa_data

def get_one_hot_sequences_dict(msa_data,MSA_start,MSA_end,vocab):
    vocab_size = len(vocab.keys())
    num_sequences_msa = len(msa_data.keys())
    one_hots = np.zeros((num_sequences_msa,MSA_end-MSA_start,vocab_size))
    for i,seq_name in enumerate(msa_data.keys()):
        sequence = msa_data[seq_name]
        for j,letter in enumerate(sequence):
            if letter in vocab:
                k = vocab[letter]
                one_hots[i,j,k] = 1.0
    return one_hots

def one_hot(sequence_string,vocab):
    one_hots = np.zeros((len(sequence_string),len(vocab.keys())))
    for j,letter in enumerate(sequence_string):
        if letter in vocab:
            k = vocab[letter]
            one_hots[j,k] = 1.0
    return one_hots.flatten()

def get_msa_prior(MSA_data_file, MSA_weight_file_name, MSA_start, MSA_end, len_target_seq, vocab, retrieval_aggregation_mode="aggregate_substitution", filter_MSA=True, verbose=False):
    """
    Function to enable retrieval inference mode, via computation of (weighted) pseudocounts of AAs at each position of the retrieved MSA.
    MSA_data_file: (string) path to MSA file (expects a2m format).
    MSA_weight_file_name: (string) path to sequence weights in MSA.
    MSA_start: (int) Sequence position that the MSA starts at (1-indexing).
    MSA_end: (int) Sequence position that the MSA ends at (1-indexing).
    len_target_seq: (int) Full length of sequence to be scored.
    vocab: (dict) Vocabulary of the tokenizer.
    retrieval_aggregation_mode: (string) Mode for retrieval inference (aggregate_substitution Vs aggregate_indel). If None, places a uniform prior over each token.
    filter_MSA: (bool) Whether to filter out sequences with very low hamming similarity (< 0.2) to the reference sequence in the MSA (first sequence).
    verbose: (bool) Whether to print to the console processing details along the way.
    """
    msa_data = process_msa_data(MSA_data_file)
    vocab_size = len(vocab.keys())
    if verbose: print("Target seq len is {}, MSA length is {}, start position is {}, end position is {} and vocab size is {}".format(len_target_seq,MSA_end-MSA_start,MSA_start,MSA_end,vocab_size))

    if filter_MSA:
        if verbose: print("Num sequences in MSA pre filtering: {}".format(len(msa_data.keys())))
        list_sequence_names = list(msa_data.keys())
        focus_sequence_name = list(msa_data.keys())[0]
        ref_sequence_hot = one_hot(msa_data[focus_sequence_name],vocab)
        for sequence_name in list_sequence_names:
            seq_hot = one_hot(msa_data[sequence_name],vocab)
            hamming_similarity_seq_ref = np.dot(ref_sequence_hot,seq_hot) / np.dot(ref_sequence_hot,ref_sequence_hot)
            if hamming_similarity_seq_ref < 0.2:
                del msa_data[sequence_name]
        if verbose: print("Num sequences in MSA post filtering: {}".format(len(msa_data.keys())))

    if MSA_weight_file_name is not None:
        if verbose: print("Using weights in {} for sequences in MSA.".format(MSA_weight_file_name))
        assert os.path.exists(MSA_weight_file_name), "Weights file not located on disk."
        MSA_EVE = MSA_processing(
                MSA_location=MSA_data_file,
                use_weights=True,
                weights_location=MSA_weight_file_name
        )
        #We scan through all sequences to see if we have a weight for them as per EVE pre-processing. We drop them otherwise.
        dropped_sequences=0
        list_sequence_names = list(msa_data.keys())
        MSA_weight=[]
        for sequence_name in list_sequence_names:
            if sequence_name not in MSA_EVE.seq_name_to_sequence:
                dropped_sequences +=1
                del msa_data[sequence_name]
            else:
                MSA_weight.append(MSA_EVE.seq_name_to_weight[sequence_name])
        if verbose: print("Dropped {} sequences from MSA due to absent sequence weights".format(dropped_sequences))
    else:
        MSA_weight = [1] * len(list(msa_data.keys()))

    if retrieval_aggregation_mode=="aggregate_substitution" or retrieval_aggregation_mode=="aggregate_indel":
        one_hots = get_one_hot_sequences_dict(msa_data,MSA_start,MSA_end,vocab)
        MSA_weight = np.expand_dims(np.array(MSA_weight),axis=(1,2))
        base_rate = 1e-5
        base_rates = np.ones_like(one_hots) * base_rate
        weighted_one_hots = (one_hots + base_rates) * MSA_weight
        MSA_weight_norm_counts = weighted_one_hots.sum(axis=-1).sum(axis=0)
        MSA_weight_norm_counts = np.tile(MSA_weight_norm_counts.reshape(-1,1), (1,vocab_size))
        one_hots_avg = weighted_one_hots.sum(axis=0) / MSA_weight_norm_counts
        msa_prior = np.zeros((len_target_seq,vocab_size))
        msa_prior[MSA_start:MSA_end,:]=one_hots_avg
    else:
        msa_prior = np.ones((len_target_seq,vocab_size)) / vocab_size

    if verbose:
        for idx, position in enumerate(msa_prior):
            if len(position)!=25:
                print("Size error")
            if not round(position.sum(),2)==1.0:
                print("Position at index {} does not add up to 1: {}".format(idx, position.sum()))

    return msa_prior


def update_retrieved_MSA_log_prior_indel(model, MSA_log_prior, MSA_start, MSA_end, mutated_sequence):
    """
    Function to process MSA when scoring indels.
    To identify positions to add / remove in the retrieved MSA, we append and align the sequence to be scored to the original MSA for that protein family with Clustal Omega.
    If the original MSA is relatively deep (over 100k sequences), we sample (by default) 100k rows at random from that MSA to speed computations.
    MSA sampling is performed only once (for the first sequence to be scored). Subsequent scoring uses the same MSA sample.
    """
    if not os.path.isdir(model.MSA_folder + os.sep + "Sampled"):
        os.mkdir(model.MSA_folder + os.sep + "Sampled")
    sampled_MSA_location = model.MSA_folder + os.sep + "Sampled" + os.sep + "Sampled_" + model.MSA_filename.split(os.sep)[-1]

    if not os.path.exists(sampled_MSA_location):
        msa_data = process_msa_data(model.MSA_filename)
        msa_data_sampled = filter_msa(msa_data, num_sequences_kept=100000) #If MSA has less than 100k sequences, the sample is identical to original MSA
        with open(sampled_MSA_location, 'w') as sampled_write_location:
            for index, key in enumerate(msa_data_sampled):
                key_name = ">REFERENCE_SEQUENCE" if index==0 else key
                msa_data_sampled[key] = msa_data_sampled[key].upper()
                msa_data_sampled[key] = msa_data_sampled[key].replace(".","-")
                sampled_write_location.write(key_name+"\n"+"\n".join([msa_data_sampled[key][i:i+80] for i in range(0, len(msa_data_sampled[key]), 80)])+"\n")

    seq_to_align_location = model.MSA_folder + os.sep + "Sampled" + os.sep + "Seq_to_align_" + model.MSA_filename.split(os.sep)[-1]
    sequence_text_split = [mutated_sequence[i:i+80] for i in range(0, len(mutated_sequence), 80)]
    sequence_text_split_split_join = "\n".join([">SEQ_TO_SCORE"]+sequence_text_split)
    os.system("echo '"+sequence_text_split_split_join+"' > "+seq_to_align_location)

    expanded_MSA_location = model.MSA_folder + os.sep + "Sampled" + os.sep + "Expanded_" + model.MSA_filename.split(os.sep)[-1]
    clustalw_cline = ClustalOmegaCommandline(cmd=model.config.clustal_omega_location,
                                            profile1=sampled_MSA_location,
                                            profile2=seq_to_align_location,
                                            outfile=expanded_MSA_location,
                                            force=True)
    stdout, stderr = clustalw_cline()
    msa_data = process_msa_data(expanded_MSA_location)
    aligned_seqA, aligned_seqB = msa_data[">SEQ_TO_SCORE"], msa_data[">REFERENCE_SEQUENCE"]
    try:
        keep_column=[]
        for column_index_pairwise_alignment in range(len(aligned_seqA)):
            if aligned_seqA[column_index_pairwise_alignment]=="-" and aligned_seqB[column_index_pairwise_alignment]=="-":
                continue
            elif aligned_seqA[column_index_pairwise_alignment]=="-":
                keep_column.append(False)
            elif aligned_seqB[column_index_pairwise_alignment]=="-":
                MSA_log_prior=torch.cat((MSA_log_prior[:column_index_pairwise_alignment], torch.zeros(MSA_log_prior.shape[1]).view(1,-1).cuda(), MSA_log_prior[column_index_pairwise_alignment:]),dim=0)
                keep_column.append(True) #keep the zero column we just added
            else:
                keep_column.append(True)
        MSA_log_prior = MSA_log_prior[keep_column]
        MSA_end = MSA_start + len(MSA_log_prior)
    except:
        print("Error when processing the following alignment: {}".format(expanded_MSA_location))
    return MSA_log_prior, MSA_start, MSA_end

class MSA_processing:
    def __init__(self,
        MSA_location="",
        theta=0.2,
        use_weights=True,
        weights_location="./data/weights",
        preprocess_MSA=True,
        threshold_sequence_frac_gaps=0.5,
        threshold_focus_cols_frac_gaps=0.3,
        remove_sequences_with_indeterminate_AA_in_focus_cols=True
        ):

        """
        This MSA_processing class is directly borrowed from the EVE codebase: https://github.com/OATML-Markslab/EVE

        Parameters:
        - msa_location: (path) Location of the MSA data. Constraints on input MSA format:
            - focus_sequence is the first one in the MSA data
            - first line is structured as follows: ">focus_seq_name/start_pos-end_pos" (e.g., >SPIKE_SARS2/310-550)
            - corresponding sequence data located on following line(s)
            - then all other sequences follow with ">name" on first line, corresponding data on subsequent lines
        - theta: (float) Sequence weighting hyperparameter. Generally: Prokaryotic and eukaryotic families = 0.2; Viruses = 0.01
        - use_weights: (bool) If False, sets all sequence weights to 1. If True, checks weights_location -- if non empty uses that;
            otherwise compute weights from scratch and store them at weights_location
        - weights_location: (path) Location to load from/save to the sequence weights
        - preprocess_MSA: (bool) performs pre-processing of MSA to remove short fragments and positions that are not well covered.
        - threshold_sequence_frac_gaps: (float, between 0 and 1) Threshold value to define fragments
            - sequences with a fraction of gap characters above threshold_sequence_frac_gaps are removed
            - default is set to 0.5 (i.e., fragments with 50% or more gaps are removed)
        - threshold_focus_cols_frac_gaps: (float, between 0 and 1) Threshold value to define focus columns
            - positions with a fraction of gap characters above threshold_focus_cols_pct_gaps will be set to lower case (and not included in the focus_cols)
            - default is set to 0.3 (i.e., focus positions are the ones with 30% of gaps or less, i.e., 70% or more residue occupancy)
        - remove_sequences_with_indeterminate_AA_in_focus_cols: (bool) Remove all sequences that have indeterminate AA (e.g., B, J, X, Z) at focus positions of the wild type
        """
        np.random.seed(2021)
        self.MSA_location = MSA_location
        self.weights_location = weights_location
        self.theta = theta
        self.alphabet = "ACDEFGHIKLMNPQRSTVWY"
        self.use_weights = use_weights
        self.preprocess_MSA = preprocess_MSA
        self.threshold_sequence_frac_gaps = threshold_sequence_frac_gaps
        self.threshold_focus_cols_frac_gaps = threshold_focus_cols_frac_gaps
        self.remove_sequences_with_indeterminate_AA_in_focus_cols = remove_sequences_with_indeterminate_AA_in_focus_cols

        self.gen_alignment()

    def gen_alignment(self, verbose=False):
        """ Read training alignment and store basics in class instance """
        self.aa_dict = {}
        for i,aa in enumerate(self.alphabet):
            self.aa_dict[aa] = i

        self.seq_name_to_sequence = defaultdict(str)
        name = ""
        with open(self.MSA_location, "r") as msa_data:
            for i, line in enumerate(msa_data):
                line = line.rstrip()
                if line.startswith(">"):
                    name = line
                    if i==0:
                        self.focus_seq_name = name
                else:
                    self.seq_name_to_sequence[name] += line

        ## MSA pre-processing to remove inadequate columns and sequences
        if self.preprocess_MSA:
            msa_df = pd.DataFrame.from_dict(self.seq_name_to_sequence, orient='index', columns=['sequence'])
            # Data clean up
            msa_df.sequence = msa_df.sequence.apply(lambda x: x.replace(".","-")).apply(lambda x: ''.join([aa.upper() for aa in x]))
            # Remove columns that would be gaps in the wild type
            non_gap_wt_cols = [aa!='-' for aa in msa_df.sequence[self.focus_seq_name]]
            msa_df['sequence'] = msa_df['sequence'].apply(lambda x: ''.join([aa for aa,non_gap_ind in zip(x, non_gap_wt_cols) if non_gap_ind]))
            assert 0.0 <= self.threshold_sequence_frac_gaps <= 1.0,"Invalid fragment filtering parameter"
            assert 0.0 <= self.threshold_focus_cols_frac_gaps <= 1.0,"Invalid focus position filtering parameter"
            msa_array = np.array([list(seq) for seq in msa_df.sequence])
            gaps_array = np.array(list(map(lambda seq: [aa=='-' for aa in seq], msa_array)))
            # Identify fragments with too many gaps
            seq_gaps_frac = gaps_array.mean(axis=1)
            seq_below_threshold = seq_gaps_frac <= self.threshold_sequence_frac_gaps
            if verbose: print("Proportion of sequences dropped due to fraction of gaps: "+str(round(float(1 - seq_below_threshold.sum()/seq_below_threshold.shape)*100,2))+"%")
            # Identify focus columns
            columns_gaps_frac = gaps_array[seq_below_threshold].mean(axis=0)
            index_cols_below_threshold = columns_gaps_frac <= self.threshold_focus_cols_frac_gaps
            if verbose: print("Proportion of non-focus columns removed: "+str(round(float(1 - index_cols_below_threshold.sum()/index_cols_below_threshold.shape)*100,2))+"%")
            # Lower case non focus cols and filter fragment sequences
            msa_df['sequence'] = msa_df['sequence'].apply(lambda x: ''.join([aa.upper() if upper_case_ind else aa.lower() for aa, upper_case_ind in zip(x, index_cols_below_threshold)]))
            msa_df = msa_df[seq_below_threshold]
            # Overwrite seq_name_to_sequence with clean version
            self.seq_name_to_sequence = defaultdict(str)
            for seq_idx in range(len(msa_df['sequence'])):
                self.seq_name_to_sequence[msa_df.index[seq_idx]] = msa_df.sequence[seq_idx]

        self.focus_seq = self.seq_name_to_sequence[self.focus_seq_name]
        self.focus_cols = [ix for ix, s in enumerate(self.focus_seq) if s == s.upper() and s!='-']
        self.focus_seq_trimmed = [self.focus_seq[ix] for ix in self.focus_cols]
        self.seq_len = len(self.focus_cols)
        self.alphabet_size = len(self.alphabet)

        # Connect local sequence index with uniprot index (index shift inferred from 1st row of MSA)
        focus_loc = self.focus_seq_name.split("/")[-1]
        start,stop = focus_loc.split("-")
        self.focus_start_loc = int(start)
        self.focus_stop_loc = int(stop)
        self.uniprot_focus_col_to_wt_aa_dict \
            = {idx_col+int(start):self.focus_seq[idx_col] for idx_col in self.focus_cols}
        self.uniprot_focus_col_to_focus_idx \
            = {idx_col+int(start):idx_col for idx_col in self.focus_cols}

        # Move all letters to CAPS; keeps focus columns only
        self.raw_seq_name_to_sequence = self.seq_name_to_sequence.copy()
        for seq_name,sequence in self.seq_name_to_sequence.items():
            sequence = sequence.replace(".","-")
            self.seq_name_to_sequence[seq_name] = [sequence[ix].upper() for ix in self.focus_cols]

        # Remove sequences that have indeterminate AA (e.g., B, J, X, Z) in the focus columns
        if self.remove_sequences_with_indeterminate_AA_in_focus_cols:
            alphabet_set = set(list(self.alphabet))
            seq_names_to_remove = []
            for seq_name,sequence in self.seq_name_to_sequence.items():
                for letter in sequence:
                    if letter not in alphabet_set and letter != "-":
                        seq_names_to_remove.append(seq_name)
                        continue
            seq_names_to_remove = list(set(seq_names_to_remove))
            for seq_name in seq_names_to_remove:
                del self.seq_name_to_sequence[seq_name]

        # Encode the sequences
        self.one_hot_encoding = np.zeros((len(self.seq_name_to_sequence.keys()),len(self.focus_cols),len(self.alphabet)))
        if verbose: print("One-hot encoded sequences shape:" + str(self.one_hot_encoding.shape))
        for i,seq_name in enumerate(self.seq_name_to_sequence.keys()):
            sequence = self.seq_name_to_sequence[seq_name]
            for j,letter in enumerate(sequence):
                if letter in self.aa_dict:
                    k = self.aa_dict[letter]
                    self.one_hot_encoding[i,j,k] = 1.0

        if self.use_weights:
            try:
                self.weights = np.load(file=self.weights_location)
                if verbose: print("Loaded sequence weights from disk")
            except:
                if verbose: print ("Computing sequence weights")
                list_seq = self.one_hot_encoding
                list_seq = list_seq.reshape((list_seq.shape[0], list_seq.shape[1] * list_seq.shape[2]))
                def compute_weight(seq):
                    number_non_empty_positions = np.dot(seq,seq)
                    if number_non_empty_positions>0:
                        denom = np.dot(list_seq,seq) / np.dot(seq,seq)
                        denom = np.sum(denom > 1 - self.theta)
                        return 1/denom
                    else:
                        return 0.0 #return 0 weight if sequence is fully empty
                self.weights = np.array(list(map(compute_weight,list_seq)))
                np.save(file=self.weights_location, arr=self.weights)
        else:
            # If not using weights, use an isotropic weight matrix
            if verbose: print("Not weighting sequence data")
            self.weights = np.ones(self.one_hot_encoding.shape[0])

        self.Neff = np.sum(self.weights)
        self.num_sequences = self.one_hot_encoding.shape[0]
        self.seq_name_to_weight={}
        for i,seq_name in enumerate(self.seq_name_to_sequence.keys()):
            self.seq_name_to_weight[seq_name]=self.weights[i]

        if verbose:
            print ("Neff =",str(self.Neff))
            print ("Data Shape =",self.one_hot_encoding.shape)
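The EVE-style sequence weighting above down-weights redundant MSA members: each sequence gets weight 1 over the number of sequences whose normalized one-hot overlap with it exceeds 1 - theta, and get_msa_prior then averages the weighted one-hot columns (plus a small pseudocount) into a per-position distribution. A small numpy sketch of that weighting on a toy one-hot matrix, with shapes and theta chosen purely for illustration:

import numpy as np

theta = 0.2
# Toy flattened one-hot encodings for 3 aligned sequences (2 positions x 3-letter alphabet)
list_seq = np.array([
    [1, 0, 0, 0, 1, 0],
    [1, 0, 0, 0, 1, 0],   # duplicate of the first sequence
    [0, 1, 0, 0, 0, 1],
], dtype=float)

weights = []
for seq in list_seq:
    similarity = list_seq @ seq / (seq @ seq)          # normalized overlap with every sequence
    weights.append(1.0 / np.sum(similarity > 1 - theta))
# weights -> [0.5, 0.5, 1.0]: the duplicated pair of sequences shares one unit of weight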
tranception/utils/scoring_utils.py
ADDED
@@ -0,0 +1,203 @@
import os
import tqdm
import re
import numpy as np
import pandas as pd

import torch
from torch.nn import CrossEntropyLoss, NLLLoss
from torch.utils.data.sampler import Sampler, SequentialSampler

from transformers import DataCollatorForLanguageModeling, PreTrainedTokenizerFast
from datasets import Dataset

AA_vocab = "ACDEFGHIKLMNPQRSTVWY"

def get_mutated_sequence(focus_seq, mutant, start_idx=1, AA_vocab=AA_vocab):
    """
    Helper function that mutates an input sequence (focus_seq) via an input mutation triplet (substitutions only).
    Mutation triplets are typically based on 1-indexing: start_idx is used for switching to 0-indexing.
    """
    mutated_seq = list(focus_seq)
    for mutation in mutant.split(":"):
        try:
            from_AA, position, to_AA = mutation[0], int(mutation[1:-1]), mutation[-1]
        except:
            print("Issue with mutant: "+str(mutation))
        relative_position = position - start_idx
        assert (from_AA==focus_seq[relative_position]), "Invalid from_AA or mutant position: "+str(mutation)+" from_AA: "+str(from_AA) + " relative pos: "+str(relative_position) + " focus_seq: "+str(focus_seq)
        assert (to_AA in AA_vocab) , "Mutant to_AA is invalid: "+str(mutation)
        mutated_seq[relative_position] = to_AA
    return "".join(mutated_seq)

def nanmean(v, *args, inplace=False, **kwargs):
    if not inplace:
        v = v.clone()
    is_nan = torch.isnan(v)
    v[is_nan] = 0
    return v.sum(*args, **kwargs) / (~is_nan).float().sum(*args, **kwargs)

def nansum(v, *args, inplace=False, **kwargs):
    if not inplace:
        v = v.clone()
    is_nan = torch.isnan(v)
    v[is_nan] = 0
    return v.sum(*args, **kwargs)

def get_optimal_window(mutation_position_relative, seq_len_wo_special, model_window):
    """
    Helper function that selects an optimal sequence window that fits the maximum model context size.
    If the sequence length is less than the maximum context size, the full sequence is returned.
    """
    half_model_window = model_window // 2
    if seq_len_wo_special <= model_window:
        return [0,seq_len_wo_special]
    elif mutation_position_relative < half_model_window:
        return [0,model_window]
    elif mutation_position_relative >= seq_len_wo_special - half_model_window:
        return [seq_len_wo_special - model_window, seq_len_wo_special]
    else:
        return [max(0,mutation_position_relative-half_model_window), min(seq_len_wo_special,mutation_position_relative+half_model_window)]

def sequence_replace_single(sequence, char_to_replace, char_replacements):
    char_replacements = list(char_replacements)
    positions = [m.start() for m in re.finditer(char_to_replace, sequence)]
    replacements = np.random.choice(a=char_replacements, size=len(positions), replace=True)
    sequence=list(sequence)
    for idx, position in enumerate(positions):
        sequence[position]=replacements[idx]
    return ''.join(sequence)

def sequence_replace(sequences, char_to_replace, char_replacements):
    """
    Helper function that replaces all Amino Acids passed in via char_to_replace (as a string of AAs) with Amino Acids sampled from char_replacements (also a string of eligible AAs).
    """
    return [sequence_replace_single(sequence, char_to_replace, char_replacements) for sequence in sequences]

def get_tranception_scores_mutated_sequences(model, mutated_sequence_df, batch_size_inference, score_var_name, target_seq, num_workers=10, reverse=False, indel_mode=False):
    """
    Helper function that takes as input a set of mutated sequences (in a pandas dataframe) and returns scores for each mutation.
    If target_seq is not None, returns the delta log likelihood wrt that target sequence -- otherwise returns the log likelihood of the protein sequences.
    """
    scores = {}
    scores['mutated_sequence']=[]
    scores['sliced_mutated_sequence']=[]
    scores['window_start']=[]
    scores['window_end']=[]
    scores['score']=[]
    with torch.no_grad():
        ds = Dataset.from_pandas(mutated_sequence_df)
        ds.set_transform(model.encode_batch)
        data_collator = DataCollatorForLanguageModeling(
                        tokenizer=model.config.tokenizer,
                        mlm=False)
        sampler = SequentialSampler(ds)
        ds_loader = torch.utils.data.DataLoader(ds, batch_size=batch_size_inference, sampler=sampler, collate_fn=data_collator, num_workers=num_workers, pin_memory=True, drop_last=False)
        mutant_index=0
        for encoded_batch in tqdm.tqdm(ds_loader):
            full_batch_length = len(encoded_batch['input_ids'])
            mutated_sequence = np.array(mutated_sequence_df['mutated_sequence'][mutant_index:mutant_index+full_batch_length])
            scores['mutated_sequence'] += list(mutated_sequence)
            sliced_mutated_sequence = np.array(mutated_sequence_df['sliced_mutated_sequence'][mutant_index:mutant_index+full_batch_length])
            scores['sliced_mutated_sequence'] += list(sliced_mutated_sequence)
            window_start = np.array(mutated_sequence_df['window_start'][mutant_index:mutant_index+full_batch_length])
            scores['window_start'] += list(window_start)
            window_end = np.array(mutated_sequence_df['window_end'][mutant_index:mutant_index+full_batch_length])
            scores['window_end'] += list(window_end)
            for k, v in encoded_batch.items():
                if isinstance(v, torch.Tensor):
                    encoded_batch[k] = v.to(model.device)
            shift_labels = encoded_batch['labels'][..., 1:].contiguous()
            if (hasattr(model.config,"retrieval_aggregation_mode")) and (model.config.retrieval_aggregation_mode is not None):
                if reverse:
                    encoded_batch['flip']=torch.tensor([1]*full_batch_length)
                encoded_batch['start_slice']=window_start
                encoded_batch['end_slice']=window_end
                encoded_batch['mutated_sequence'] = mutated_sequence #only sliced_mutated_sequence is flipped in the scoring_mirror branch of score_mutants. No need to flip mutated_sequence for MSA re-aligning
                fused_shift_log_probas=model(**encoded_batch,return_dict=True).fused_shift_log_probas
                loss_fct = NLLLoss(reduction='none')
                loss = - loss_fct(input=fused_shift_log_probas.view(-1, fused_shift_log_probas.size(-1)), target=shift_labels.view(-1)).view(fused_shift_log_probas.shape[0],fused_shift_log_probas.shape[1])
            else:
                lm_logits=model(**encoded_batch,return_dict=True).logits
                shift_logits = lm_logits[..., :-1, :].contiguous()
                loss_fct = CrossEntropyLoss(reduction='none')
                loss = - loss_fct(input=shift_logits.view(-1, shift_logits.size(-1)), target=shift_labels.view(-1)).view(shift_logits.shape[0],shift_logits.shape[1])
            mask = encoded_batch['attention_mask'][..., 1:].float()
            mask[mask==0]=float('nan')
            loss *= mask
            loss = nansum(loss, dim=1)
            scores_batch = list(loss.cpu().numpy())
            full_batch_length = len(encoded_batch['input_ids'])
            scores['score'] += scores_batch
            mutant_index+=full_batch_length
    scores = pd.DataFrame(scores)
    if model.config.scoring_window=="sliding":
        scores = scores[['mutated_sequence','score']].groupby('mutated_sequence').sum().reset_index() #We need to aggregate scores when using sliding mode
        scores['score'] = scores['score'] / scores['mutated_sequence'].map(lambda x: len(x))
    if target_seq is not None:
        scores_mutated_seq = scores[scores.mutated_sequence != target_seq]
        scores_wt = scores[scores.mutated_sequence == target_seq]
        merge_delta = 'mutated_sequence' if model.config.scoring_window=="sliding" else 'window_start'
        if model.config.scoring_window=="optimal":
            delta_scores = pd.merge(scores_mutated_seq,scores_wt,how='left',on=[merge_delta],suffixes=('','_wt'))
            delta_scores[score_var_name] = delta_scores['score'] - delta_scores['score_wt']
        elif model.config.scoring_window=="sliding":
            delta_scores = scores_mutated_seq.copy()
            delta_scores[score_var_name] = delta_scores['score'] - list(scores_wt['score'])[0] # In sliding mode there is a single reference window for the WT
        return delta_scores[['mutated_sequence',score_var_name]]
    else:
        scores[score_var_name] = scores['score']
        return scores[['mutated_sequence',score_var_name]]

def get_sequence_slices(df, target_seq, model_context_len, start_idx=1, scoring_window="optimal", indel_mode=False):
    """
    Helper function that takes as input a (pandas) dataframe df that contains a list of mutant triplets (substitutions) or full mutated sequences (indels) for scoring.
    It returns a processed DMS in which sequences have been sliced to satisfy the maximum context window of the model.
    df: (dataframe) Input dataframe to be processed
    target_seq: (string) Full reference sequence (wild type) that is mutated in the DMS assay.
    model_context_len: (int) Maximum context size for the model.
    start_idx: (int) Integer to move to 0-indexing of positions (mutation triplets are typically based on 1-indexing).
    scoring_window: (string) Method to slice sequences longer than maximum context size:
        - optimal selects a single window as large as possible via the get_optimal_window function (this is the default)
        - sliding splits the full sequence in contiguous (non-overlapping) chunks that are of size equal to the max context (except the last chunk which may be shorter)
    indel_mode: (bool) Flag to be used when scoring insertions and deletions. Otherwise assumes substitutions.
    Note: when scoring indels for sequences that would be longer than the model max context length, it is preferable to use the "sliding" scoring_window. Use "optimal" otherwise.
    """
    len_target_seq = len(target_seq)
    num_mutants = len(df['mutated_sequence'])
    df=df.reset_index(drop=True)
    if scoring_window=="optimal":
        df['mutation_barycenter'] = df['mutant'].apply(lambda x: int(np.array([int(mutation[1:-1]) - start_idx for mutation in x.split(':')]).mean())) if not indel_mode else df['mutated_sequence'].apply(lambda x: len(x)//2)
        df['scoring_optimal_window'] = df['mutation_barycenter'].apply(lambda x: get_optimal_window(x, len_target_seq, model_context_len)) if not indel_mode else df['mutated_sequence'].apply(lambda x: (0,len(x)))
        df['sliced_mutated_sequence'] = [df['mutated_sequence'][index][df['scoring_optimal_window'][index][0]:df['scoring_optimal_window'][index][1]] for index in range(num_mutants)]
        df['window_start'] = df['scoring_optimal_window'].map(lambda x: x[0])
        df['window_end'] = df['scoring_optimal_window'].map(lambda x: x[1])
        del df['scoring_optimal_window'], df['mutation_barycenter']
        if 'mutant' in df: del df['mutant']
        df_wt=df.copy()
        df_wt['mutated_sequence'] = [target_seq] * num_mutants
        if indel_mode: # For indels, we set the wild type reference to be always the same (full length) sequence. We assume here that the length is lower than model context size (otherwise "Sliding" mode should be used)
            df_wt['window_end'] = df_wt['mutated_sequence'].map(lambda x:len(x))
        df_wt['sliced_mutated_sequence'] = [target_seq[df_wt['window_start'][index]:df_wt['window_end'][index]] for index in range(num_mutants)]
        df = pd.concat([df,df_wt], axis=0)
        df = df.drop_duplicates()
    elif scoring_window=="sliding":
        num_windows = 1 + int( len_target_seq / model_context_len)
        df_list=[]
        start=0
        for window_index in range(1, num_windows+1):
            df_sliced = df.copy()
            df_sliced['sliced_mutated_sequence'] = df_sliced['mutated_sequence'].map(lambda x: x[start:start+model_context_len])
            df_sliced['window_start'] = [start] * num_mutants
            df_sliced['window_end'] = df_sliced['mutated_sequence'].map(lambda x: min(len(x), start+model_context_len))
            df_sliced_wt = df_sliced.copy()
            df_sliced_wt['mutated_sequence'] = [target_seq] * num_mutants
            df_sliced_wt['sliced_mutated_sequence'] = df_sliced_wt['mutated_sequence'].map(lambda x: x[start:start+model_context_len])
            df_sliced_wt['window_end'] = df_sliced_wt['mutated_sequence'].map(lambda x: min(len(x), start+model_context_len)) #Need to adjust end index if WT and sequence are not same full length
            df_list.append(df_sliced)
            df_list.append(df_sliced_wt)
            start += model_context_len
        df_final = pd.concat(df_list,axis=0)
        if 'mutant' in df_final: del df_final['mutant']
        df = df_final.drop_duplicates()
    return df.reset_index(drop=True)
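Two of the helpers above are easiest to understand from a concrete call. A short sketch with made-up inputs; the toy sequence, positions, and window size are illustrative only:

from tranception.utils.scoring_utils import get_mutated_sequence, get_optimal_window

# Apply the 1-indexed substitution triplet A3G to a toy sequence
print(get_mutated_sequence("MKAYI", "A3G"))   # -> "MKGYI"

# Pick a scoring window of at most 4 residues around 0-indexed position 7 of a 10-residue sequence
print(get_optimal_window(mutation_position_relative=7, seq_len_wo_special=10, model_window=4))   # -> [5, 9]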
tranception/utils/tokenizers/Basic_tokenizer
ADDED
@@ -0,0 +1 @@
{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[CLS]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SEP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":3,"special":true,"content":"[PAD]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":4,"special":true,"content":"[MASK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":{"type":"TemplateProcessing","single":[{"SpecialToken":{"id":"[CLS]","type_id":0}},{"Sequence":{"id":"A","type_id":0}},{"SpecialToken":{"id":"[SEP]","type_id":0}}],"pair":[{"SpecialToken":{"id":"[CLS]","type_id":0}},{"Sequence":{"id":"A","type_id":0}},{"SpecialToken":{"id":"[SEP]","type_id":0}},{"Sequence":{"id":"B","type_id":1}},{"SpecialToken":{"id":"[SEP]","type_id":1}}],"special_tokens":{"[CLS]":{"id":"[CLS]","ids":[1],"tokens":["[CLS]"]},"[SEP]":{"id":"[SEP]","ids":[2],"tokens":["[SEP]"]}}},"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[UNK]":0,"[CLS]":1,"[SEP]":2,"[PAD]":3,"[MASK]":4,"A":5,"C":6,"D":7,"E":8,"F":9,"G":10,"H":11,"I":12,"K":13,"L":14,"M":15,"N":16,"P":17,"Q":18,"R":19,"S":20,"T":21,"V":22,"W":23,"Y":24},"merges":[]}}