t03i committed
Commit 973be27
1 Parent(s): d604cdc

Fix code example

Files changed (1): README.md (+18, −19)
README.md CHANGED
@@ -1,5 +1,4 @@
  ---
- language: protein
  tags:
  - protein language model
  datasets:
@@ -38,29 +37,29 @@ We have also noticed that for feature extraction, its better to use the feature
  Here is how to use this model to extract the features of a given protein sequence in PyTorch:

  ```python
- from transformers import T5Tokenizer, T5Model
- import re
- import torch
+ sequence_examples = ["PRTEINO", "SEQWENCE"]
+ # this will replace all rare/ambiguous amino acids by X and introduce white-space between all amino acids
+ sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequence_examples]

- tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_uniref50', do_lower_case=False)
+ # tokenize sequences and pad up to the longest sequence in the batch
+ ids = tokenizer.batch_encode_plus(sequence_examples, add_special_tokens=True, padding="longest")
+ input_ids = torch.tensor(ids['input_ids']).to(device)
+ attention_mask = torch.tensor(ids['attention_mask']).to(device)

- model = T5Model.from_pretrained("Rostlab/prot_t5_xl_uniref50")
-
- sequences_Example = ["A E T C Z A O","S K T Z P"]
-
- sequences_Example = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example]
-
- ids = tokenizer.batch_encode_plus(sequences_Example, add_special_tokens=True, padding=True)
+ # generate embeddings
+ with torch.no_grad():
+     embedding_repr = model(input_ids=input_ids,attention_mask=attention_mask)

- input_ids = torch.tensor(ids['input_ids'])
- attention_mask = torch.tensor(ids['attention_mask'])
+ # extract embeddings for the first ([0,:]) sequence in the batch while removing padded & special tokens ([0,:7])
+ emb_0 = embedding_repr.last_hidden_state[0,:7] # shape (7 x 1024)
+ print(f"Shape of per-residue embedding of first sequences: {emb_0.shape}")
+ # do the same for the second ([1,:]) sequence in the batch while taking into account different sequence lengths ([1,:8])
+ emb_1 = embedding_repr.last_hidden_state[1,:8] # shape (8 x 1024)

- with torch.no_grad():
-     embedding = model(input_ids=input_ids,attention_mask=attention_mask,decoder_input_ids=None)
+ # if you want to derive a single representation (per-protein embedding) for the whole protein
+ emb_0_per_protein = emb_0.mean(dim=0) # shape (1024)

- # For feature extraction we recommend to use the encoder embedding
- encoder_embedding = embedding[2].cpu().numpy()
- decoder_embedding = embedding[0].cpu().numpy()
+ print(f"Shape of per-protein embedding of first sequences: {emb_0_per_protein.shape}")
  ```

  ## Training data
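
Note that the added lines in this hunk reference `re`, `torch`, `tokenizer`, `model`, and `device` without defining them, so they rely on setup code that sits earlier in the README, outside the hunk shown above. A minimal sketch of that setup, assuming an encoder-only `T5EncoderModel` load and a standard CUDA/CPU device pick (the exact class and precision handling in the README may differ):

```python
from transformers import T5Tokenizer, T5EncoderModel
import torch
import re

# pick a GPU if available, otherwise fall back to CPU (assumption: single-device setup)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# do_lower_case=False keeps the upper-case amino-acid letters intact
tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50", do_lower_case=False)

# an encoder-only model is enough for feature extraction
# (assumption: T5EncoderModel rather than the full T5Model used by the removed snippet)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50").to(device)
model.eval()
```

With this in place the added snippet runs as written: `last_hidden_state` has shape (batch, sequence length, 1024), and `emb_0.mean(dim=0)` collapses the per-residue embeddings of the first sequence into a single 1024-dimensional per-protein vector.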