hdallatorre committed on
Commit
805bd32
1 Parent(s): 8eb8eb4

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +5 -0
README.md CHANGED
@@ -41,6 +41,11 @@ import torch
41
  tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-1000g")
42
  model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-1000g")
43
 
 
 
 
 
 
44
  # Create a dummy dna sequence and tokenize it
45
  sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
46
  tokens_ids = tokenizer.batch_encode_plus(sequences, return_tensors="pt", padding="max_length", max_length = max_length)["input_ids"]
 
41
  tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-1000g")
42
  model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-1000g")
43
 
44
+ # Choose the length to which the input sequences are padded. By default, the
45
+ # model max length is chosen, but feel free to decrease it as the time taken to
46
+ # obtain the embeddings increases significantly with it.
47
+ max_length = tokenizer.model_max_length
48
+
49
  # Create a dummy dna sequence and tokenize it
50
  sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
51
  tokens_ids = tokenizer.batch_encode_plus(sequences, return_tensors="pt", padding="max_length", max_length = max_length)["input_ids"]