monsoon-nlp
committed on
Commit
•
8badb26
1
Parent(s):
4e7da13
uppercase nucleotides for biotokens
Browse files
README.md
CHANGED
@@ -54,7 +54,7 @@ tokenizer = AutoTokenizer.from_pretrained("monsoon-nlp/llama3-biotokenpretrain-k
|
|
54 |
tokenizer.pad_token = tokenizer.eos_token # pad fix
|
55 |
|
56 |
qed = "∎" # from math symbols, used in pretraining
|
57 |
-
sequence = "".join([(qed + nt) for nt in "GCCTATAGTGTGTAGCTAATGAGCCTAGGTTATCGACCCTAATCT"])
|
58 |
|
59 |
inputs = tokenizer(f"{prefix}{sequence}{annotation}", return_tensors="pt")
|
60 |
outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=50)
|
|
|
54 |
tokenizer.pad_token = tokenizer.eos_token # pad fix
|
55 |
|
56 |
qed = "∎" # from math symbols, used in pretraining
|
57 |
+
sequence = "".join([(qed + nt.upper()) for nt in "GCCTATAGTGTGTAGCTAATGAGCCTAGGTTATCGACCCTAATCT"])
|
58 |
|
59 |
inputs = tokenizer(f"{prefix}{sequence}{annotation}", return_tensors="pt")
|
60 |
outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=50)
|