monsoon-nlp committed on
Commit
8badb26
1 Parent(s): 4e7da13

uppercase nucleotides for biotokens

Browse files
Files changed (1) hide show
  1. README.md +1 -1
README.md CHANGED
@@ -54,7 +54,7 @@ tokenizer = AutoTokenizer.from_pretrained("monsoon-nlp/llama3-biotokenpretrain-k
54
  tokenizer.pad_token = tokenizer.eos_token # pad fix
55
 
56
  qed = "∎" # from math symbols, used in pretraining
57
- sequence = "".join([(qed + nt) for nt in "GCCTATAGTGTGTAGCTAATGAGCCTAGGTTATCGACCCTAATCT"])
58
 
59
  inputs = tokenizer(f"{prefix}{sequence}{annotation}", return_tensors="pt")
60
  outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=50)
 
54
  tokenizer.pad_token = tokenizer.eos_token # pad fix
55
 
56
  qed = "∎" # from math symbols, used in pretraining
57
+ sequence = "".join([(qed + nt.upper()) for nt in "GCCTATAGTGTGTAGCTAATGAGCCTAGGTTATCGACCCTAATCT"])
58
 
59
  inputs = tokenizer(f"{prefix}{sequence}{annotation}", return_tensors="pt")
60
  outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=50)