monsoon-nlp
committed on
tokenizer fix
Browse files
README.md
CHANGED
@@ -43,7 +43,7 @@ Information about location in the kaniwa chromosome: >lcl|Cp5
|
|
43 |
|
44 |
## Usage
|
45 |
|
46 |
-
###
|
47 |
|
48 |
```python
|
49 |
from peft import AutoPeftModelForCausalLM
|
@@ -64,16 +64,18 @@ sample = tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]
|
|
64 |
### LoRA finetuning on a new task
|
65 |
|
66 |
```python
|
|
|
67 |
from trl import SFTTrainer
|
68 |
from unsloth import FastLanguageModel
|
69 |
|
70 |
-
model, tokenizer = FastLanguageModel.from_pretrained(
|
71 |
model_name = "monsoon-nlp/llama3-biotokenpretrain-kaniwa",
|
72 |
max_seq_length = 7_000, # max 6,000 bp for AgroNT tasks
|
73 |
dtype = None,
|
74 |
load_in_4bit = True,
|
75 |
resize_model_vocab=128260, # includes biotokens
|
76 |
)
|
|
|
77 |
tokenizer.pad_token = tokenizer.eos_token # pad fix
|
78 |
|
79 |
trainer = SFTTrainer(
|
|
|
43 |
|
44 |
## Usage
|
45 |
|
46 |
+
### Inference with DNA sequence
|
47 |
|
48 |
```python
|
49 |
from peft import AutoPeftModelForCausalLM
|
|
|
64 |
### LoRA finetuning on a new task
|
65 |
|
66 |
```python
|
67 |
+
from transformers import AutoTokenizer
|
68 |
from trl import SFTTrainer
|
69 |
from unsloth import FastLanguageModel
|
70 |
|
71 |
+
model, _ = FastLanguageModel.from_pretrained(
|
72 |
model_name = "monsoon-nlp/llama3-biotokenpretrain-kaniwa",
|
73 |
max_seq_length = 7_000, # max 6,000 bp for AgroNT tasks
|
74 |
dtype = None,
|
75 |
load_in_4bit = True,
|
76 |
resize_model_vocab=128260, # includes biotokens
|
77 |
)
|
78 |
+
tokenizer = AutoTokenizer.from_pretrained("monsoon-nlp/llama3-biotokenpretrain-kaniwa")
|
79 |
tokenizer.pad_token = tokenizer.eos_token # pad fix
|
80 |
|
81 |
trainer = SFTTrainer(
|