mheinz committed
Commit 648ea67
1 Parent(s): a48e328

Update README.md

Files changed (1):
  1. README.md (+31, -43)

README.md CHANGED
@@ -104,8 +104,10 @@ model = AutoModelForSeq2SeqLM.from_pretrained("Rostlab/ProstT5").to(device)
 # only GPUs support half-precision currently; if you want to run on CPU use full-precision (not recommended, much slower)
 model.full() if device=='cpu' else model.half()
 
-# prepare your protein sequences/structures as a list. Amino acid sequences are expected to be upper-case ("PRTEINO" below)
-folding_example = ["PRTEINO", "SEQWENCE"]
+# prepare your protein sequences/structures as a list.
+# Amino acid sequences are expected to be upper-case ("PRTEINO" below)
+# while 3Di-sequences need to be lower-case.
+sequence_examples = ["PRTEINO", "SEQWENCE"]
 min_len = min([ len(s) for s in folding_example])
 max_len = max([ len(s) for s in folding_example])
 
@@ -116,9 +118,12 @@ sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequen
 sequence_examples = [ "<AA2fold>" + " " + s for s in sequence_examples]
 
 # tokenize sequences and pad up to the longest sequence in the batch
-ids = tokenizer.batch_encode_plus(sequences_example, add_special_tokens=True, padding="longest",return_tensors='pt').to(device)
+ids = tokenizer.batch_encode_plus(sequences_example,
+                                  add_special_tokens=True,
+                                  padding="longest",
+                                  return_tensors='pt').to(device))
 
-# Generation configuration
+# Generation configuration for "folding" (AA-->3Di)
 gen_kwargs_aa2fold = {
                   "do_sample": True,
                   "num_beams": 3,
@@ -128,11 +133,11 @@ gen_kwargs_aa2fold = {
                   "repetition_penalty" : 1.2,
 }
 
-# translate from AA to 3Di
+# translate from AA to 3Di (AA-->3Di)
 with torch.no_grad():
-    target = model.generate(
-              start_encoding.input_ids,
-              attention_mask=start_encoding.attention_mask,
+    translations = model.generate(
+              ids.input_ids,
+              attention_mask=ids.attention_mask,
               max_length=max_len, # max length of generated text
               min_length=min_len, # minimum length of the generated text
               early_stopping=True, # stop early if end-of-text token is generated
@@ -140,40 +145,22 @@ with torch.no_grad():
               **gen_kwargs_aa2fold
     )
 # Decode and remove white-spaces between tokens
-t_strings = tokenizer.batch_decode( target, skip_special_tokens=True )
-t_strings = [ "".join(ts.split(" ")) for ts in t_strings ]
-```
+decoded_translations = tokenizer.batch_decode( translations, skip_special_tokens=True )
+structure_sequences = [ "".join(ts.split(" ")) for ts in decoded_translations ] # predicted 3Di strings
 
-Translation ("inverse folding", i.e., 3Di to AA):
-```python
-from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
-import torch
-device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+# Now we can use the same model and invert the translation logic
+# to generate an amino acid sequence from the predicted 3Di-sequence (3Di-->AA)
 
-# Load the tokenizer
-tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False).to(device)
-
-# Load the model
-model = AutoModelForSeq2SeqLM.from_pretrained("Rostlab/ProstT5").to(device)
-
-# only GPUs support half-precision currently; if you want to run on CPU use full-precision (not recommended, much slower)
-model.full() if device=='cpu' else model.half()
-
-# prepare your protein sequences/structures as a list. Amino acid sequences are expected to be upper-case ("PRTEINO" below)
-folding_example = ["prtein", "strctr"]
-min_len = min([ len(s) for s in folding_example])
-max_len = max([ len(s) for s in folding_example])
-
-# replace all rare/ambiguous amino acids by X (3Di sequences does not have those) and introduce white-space between all sequences (AAs and 3Di)
-sequence_examples = [" ".join(list(sequence)) for sequence in sequence_examples]
-
-# add pre-fixes accordingly. For the translation from 3Di to AAs, you need to prepend "<fold2AA>"
-sequence_examples = [ "<fold2AA>" + " " + s for s in sequence_examples]
+# add pre-fixes accordingly. For the translation from 3Di to AA (3Di-->AA), you need to prepend "<fold2AA>"
+sequence_examples_backtranslation = [ "<fold2AA>" + " " + s for s in decoded_translations]
 
 # tokenize sequences and pad up to the longest sequence in the batch
-ids = tokenizer.batch_encode_plus(sequences_example, add_special_tokens=True, padding="longest",return_tensors='pt').to(device)
+ids_backtranslation = tokenizer.batch_encode_plus(sequence_examples_backtranslation,
+                                  add_special_tokens=True,
+                                  padding="longest",
+                                  return_tensors='pt').to(device))
 
-# Generation configuration
+# Example generation configuration for "inverse folding" (3Di-->AA)
 gen_kwargs_fold2AA = {
             "do_sample": True,
             "top_p" : 0.90,
@@ -182,11 +169,11 @@ gen_kwargs_fold2AA = {
             "repetition_penalty" : 1.2,
 }
 
-# translate from 3Di to AA
+# translate from 3Di to AA (3Di-->AA)
 with torch.no_grad():
-    target = model.generate(
-              start_encoding.input_ids,
-              attention_mask=start_encoding.attention_mask,
+    backtranslations = model.generate(
+              ids_backtranslation.input_ids,
+              attention_mask=ids_backtranslation.attention_mask,
               max_length=max_len, # max length of generated text
               min_length=min_len, # minimum length of the generated text
               early_stopping=True, # stop early if end-of-text token is generated
@@ -194,8 +181,9 @@ with torch.no_grad():
               **gen_kwargs_fold2AA
     )
 # Decode and remove white-spaces between tokens
-t_strings = tokenizer.batch_decode( target, skip_special_tokens=True )
-t_strings = [ "".join(ts.split(" ")) for ts in t_strings ]
+decoded_backtranslations = tokenizer.batch_decode( backtranslations, skip_special_tokens=True )
+aminoAcid_sequences = [ "".join(ts.split(" ")) for ts in decoded_backtranslations ] # predicted amino acid strings
+
 ```
189