Upload tokenizer
- model.py +116 -0
- tokenizer_config.json +6 -0
model.py
ADDED
@@ -0,0 +1,116 @@
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # silence TensorFlow logging before transformers imports it

from transformers import (
    LlamaConfig, LlamaForSequenceClassification, LlamaForCausalLM,
    GPT2Config, GPT2ForSequenceClassification, GPT2LMHeadModel,
    PreTrainedTokenizerFast
)
from tokenizers import Tokenizer
from tokenizers.models import BPE

from src.const import ACTION_SPACE, VOCAB

class RookTokenizer(PreTrainedTokenizerFast):
    # TODO: make it easier to use checkpoints from the hub
    # https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
    def __call__(self, *args, **kwargs):
        # the models built below take no token_type_ids, so never return them
        kwargs["return_token_type_ids"] = False
        return super().__call__(*args, **kwargs)

def make_model(config_dict, arch="llama"):
    # dispatch on the task recorded in the config
    if config_dict["finetuning_task"] == "text-classification":
        return make_model_clf(config_dict, arch=arch)
    elif config_dict["finetuning_task"] == "text-generation":
        return make_model_lm(config_dict, arch=arch)
    else:
        raise ValueError(f"Unknown config finetuning_task: {config_dict['finetuning_task']}")

def make_model_clf(config_dict, arch):
    if arch == "llama":
        Config = LlamaConfig
        Model = LlamaForSequenceClassification
    elif arch == "gpt2":
        Config = GPT2Config
        Model = GPT2ForSequenceClassification
    else:
        raise ValueError(f"Unknown arch: {arch}")

    # pad the vocab size up to a multiple of 128
    config_dict["vocab_size"] = ((len(VOCAB) + 127) // 128) * 128
    config = Config(**config_dict)
    # one classification label per action
    label_to_id = {v: i for i, v in enumerate(ACTION_SPACE)}
    config.num_labels = len(ACTION_SPACE)
    config.label2id = label_to_id
    config.id2label = {i: label for label, i in label_to_id.items()}
    model = Model(config=config)
    return model

def make_model_lm(config_dict, arch):
    if arch == "llama":
        Config = LlamaConfig
        Model = LlamaForCausalLM
    elif arch == "gpt2":
        Config = GPT2Config
        Model = GPT2LMHeadModel
    else:
        raise ValueError(f"Unknown arch: {arch}")

    # pad the vocab size (base vocab, action tokens, four extra markers) up to a multiple of 128
    config_dict["vocab_size"] = ((len(VOCAB) + len(ACTION_SPACE) + 4 + 127) // 128) * 128
    config = Config(**config_dict)
    model = Model(config=config)
    return model


def make_tokenizer(task="clf"):
    if task == "clf":
        return make_tokenizer_clf(model_max_length=78)
    elif task == "lm":
        return make_tokenizer_lm(model_max_length=79)
    elif task == "lm-cot":
        return make_tokenizer_lm(model_max_length=116)
    else:
        raise ValueError(f"Unknown task: {task}")

def make_tokenizer_clf(model_max_length):
    single_char_vocab = [e for e in VOCAB if len(e) == 1]
    multi_char_vocab = [e for e in VOCAB if len(e) > 1]
    # each two-character vocab entry becomes a BPE merge pair
    merges = [tuple(e) for e in multi_char_vocab]

    tokenizer = Tokenizer(BPE(
        vocab=dict(zip(single_char_vocab, range(len(single_char_vocab)))),
        merges=merges)
    )

    fast_tokenizer = RookTokenizer(
        tokenizer_object=tokenizer,
        model_max_length=model_max_length,
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        clean_up_tokenization_spaces=False
    )
    return fast_tokenizer

def make_tokenizer_lm(model_max_length):
    vocab = VOCAB + ACTION_SPACE
    vocab += ["[OPTIONS]", "[VALUES]", "[ACTION]", "0000"]

    single_char_vocab = [e for e in vocab if len(e) == 1]
    multi_char_vocab = [e for e in vocab if len(e) > 1]
    merges = []

    tokenizer = Tokenizer(BPE(
        vocab=dict(zip(single_char_vocab, range(len(single_char_vocab)))),
        merges=merges)
    )
    # multi-character entries (actions and markers) are added as atomic special tokens
    tokenizer.add_special_tokens(multi_char_vocab)

    fast_tokenizer = RookTokenizer(
        tokenizer_object=tokenizer,
        model_max_length=model_max_length,
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        clean_up_tokenization_spaces=False
    )
    return fast_tokenizer
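For context, a quick smoke test of these helpers could look like the minimal sketch below. It assumes `src.const` exposes `VOCAB` and `ACTION_SPACE` as lists of strings; the config values are illustrative, not the hyperparameters actually used in training.

from model import make_model, make_tokenizer

# hypothetical, tiny Llama config for a local smoke test
config_dict = {
    "finetuning_task": "text-classification",
    "hidden_size": 256,
    "num_hidden_layers": 4,
    "num_attention_heads": 4,
    "intermediate_size": 512,
}
model = make_model(config_dict, arch="llama")  # LlamaForSequenceClassification over ACTION_SPACE
tokenizer = make_tokenizer(task="clf")         # BPE tokenizer over VOCAB, max length 78
print(model.config.num_labels, tokenizer.model_max_length)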
tokenizer_config.json
CHANGED
@@ -33,6 +33,12 @@
       "special": true
     }
   },
+  "auto_map": {
+    "AutoTokenizer": [
+      "model.RookTokenizer",
+      null
+    ]
+  },
   "clean_up_tokenization_spaces": false,
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
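With this `auto_map` entry, `AutoTokenizer` can import `RookTokenizer` from `model.py` in the repo when remote code is enabled. A minimal loading sketch, with a hypothetical repo id:

from transformers import AutoTokenizer

# trust_remote_code is required so model.RookTokenizer can be imported from the repo
tok = AutoTokenizer.from_pretrained("org/rook", trust_remote_code=True)  # hypothetical repo id

The `null` leaves the second slot of the pair empty, so `AutoTokenizer` falls back to the first entry, `model.RookTokenizer`.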