jrahn commited on
Commit
21c1d52
·
verified ·
1 Parent(s): 05d5902

Upload tokenizer

Browse files
Files changed (2) hide show
  1. model.py +116 -0
  2. tokenizer_config.json +6 -0
model.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
3
+
4
+ from transformers import (
5
+ LlamaConfig, LlamaForSequenceClassification, LlamaForCausalLM,
6
+ GPT2Config, GPT2ForSequenceClassification, GPT2LMHeadModel,
7
+ PreTrainedTokenizerFast
8
+ )
9
+ from tokenizers import Tokenizer
10
+ from tokenizers.models import BPE
11
+
12
+ from src.const import ACTION_SPACE, VOCAB
13
+
14
class RookTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer variant that never returns ``token_type_ids``."""

    # TODO: make it easier to use checkpoints from the hub
    # https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub

    def __call__(self, *args, **kwargs):
        # Override whatever the caller requested: token_type_ids are never emitted.
        kwargs = {**kwargs, "return_token_type_ids": False}
        return super().__call__(*args, **kwargs)
20
+
21
def make_model(config_dict, arch="llama"):
    """Build a model for the task named in ``config_dict["finetuning_task"]``.

    Dispatches to the sequence-classification builder for
    "text-classification" and the causal-LM builder for "text-generation".

    Raises:
        ValueError: for any other ``finetuning_task`` value.
    """
    task = config_dict["finetuning_task"]
    if task == "text-classification":
        return make_model_clf(config_dict, arch=arch)
    if task == "text-generation":
        return make_model_lm(config_dict, arch=arch)
    raise ValueError(f"Unknown config finetuning_task: {config_dict['finetuning_task']}")
28
+
29
def make_model_clf(config_dict, arch):
    """Build a freshly initialized sequence-classification model.

    The label space is ``ACTION_SPACE``; ``label2id``/``id2label`` mappings
    are installed on the config.

    Args:
        config_dict: config kwargs; mutated in place — ``"vocab_size"`` is
            overwritten with ``len(VOCAB)`` padded up to a multiple of 128.
        arch: ``"llama"`` or ``"gpt2"``.

    Returns:
        A ``LlamaForSequenceClassification`` or
        ``GPT2ForSequenceClassification`` instance.

    Raises:
        ValueError: if ``arch`` is not a supported architecture.
    """
    if arch == "llama":
        Config = LlamaConfig
        Model = LlamaForSequenceClassification
    elif arch == "gpt2":
        Config = GPT2Config
        Model = GPT2ForSequenceClassification
    else:
        # Previously an unknown arch fell through to a NameError on Config;
        # fail explicitly instead.
        raise ValueError(f"Unknown arch: {arch}")

    # Pad vocab size to the next multiple of 128 (hardware-friendly embedding size).
    config_dict["vocab_size"] = ((len(VOCAB) + 127) // 128) * 128
    config = Config(**config_dict)
    label_to_id = {v: i for i, v in enumerate(ACTION_SPACE)}
    config.num_labels = len(ACTION_SPACE)
    config.label2id = label_to_id
    config.id2label = {idx: label for label, idx in label_to_id.items()}
    model = Model(config=config)
    return model
46
+
47
def make_model_lm(config_dict, arch):
    """Build a freshly initialized causal language model.

    Args:
        config_dict: config kwargs; mutated in place — ``"vocab_size"`` is
            overwritten with the LM vocab size (base vocab + action space +
            4 extra special tokens) padded up to a multiple of 128.
        arch: ``"llama"`` or ``"gpt2"``.

    Returns:
        A ``LlamaForCausalLM`` or ``GPT2LMHeadModel`` instance.

    Raises:
        ValueError: if ``arch`` is not a supported architecture.
    """
    if arch == "llama":
        Config = LlamaConfig
        Model = LlamaForCausalLM
    elif arch == "gpt2":
        Config = GPT2Config
        Model = GPT2LMHeadModel
    else:
        # Previously an unknown arch fell through to a NameError on Config;
        # fail explicitly instead.
        raise ValueError(f"Unknown arch: {arch}")

    # Pad vocab size to the next multiple of 128; the +4 accounts for the
    # extra special tokens added in make_tokenizer_lm.
    config_dict["vocab_size"] = ((len(VOCAB) + len(ACTION_SPACE) + 4 + 127) // 128) * 128
    config = Config(**config_dict)
    model = Model(config=config)
    return model
59
+
60
+
61
def make_tokenizer(task="clf"):
    """Return the tokenizer for *task*.

    Supported tasks: "clf" (classification, max length 78), "lm"
    (generation, max length 79), "lm-cot" (generation with chain of
    thought, max length 116).

    Raises:
        ValueError: for any other task name.
    """
    builders = {
        "clf": lambda: make_tokenizer_clf(model_max_length=78),
        "lm": lambda: make_tokenizer_lm(model_max_length=79),
        "lm-cot": lambda: make_tokenizer_lm(model_max_length=116),
    }
    if task not in builders:
        raise ValueError(f"Unknown task: {task}")
    return builders[task]()
70
+
71
def make_tokenizer_clf(model_max_length):
    """Build the classification tokenizer from ``VOCAB``.

    Single-character vocab entries become the base BPE vocabulary; every
    multi-character entry is registered as a BPE merge (its characters
    joined into one token).

    Args:
        model_max_length: maximum sequence length recorded on the tokenizer.

    Returns:
        A ``RookTokenizer`` wrapping the BPE tokenizer.
    """
    single_char_vocab = [e for e in VOCAB if len(e) == 1]
    multi_char_vocab = [e for e in VOCAB if len(e) > 1]
    # NOTE(review): tuple(e) only yields a valid (left, right) merge pair for
    # two-character entries — assumes all multi-char VOCAB entries have len 2.
    merges = [tuple(e) for e in multi_char_vocab]
    # Removed leftover debug print of the first merges.

    tokenizer = Tokenizer(BPE(
        vocab=dict(zip(single_char_vocab, range(len(single_char_vocab)))),
        merges=merges)
    )

    fast_tokenizer = RookTokenizer(
        tokenizer_object=tokenizer,
        model_max_length=model_max_length,
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        clean_up_tokenization_spaces=False
    )
    return fast_tokenizer
92
+
93
def make_tokenizer_lm(model_max_length):
    """Build the language-model tokenizer.

    The vocabulary is ``VOCAB`` plus ``ACTION_SPACE`` plus four extra
    markers. Single-character entries form the base BPE vocabulary; every
    multi-character entry is added as a special token (no merges).

    Args:
        model_max_length: maximum sequence length recorded on the tokenizer.

    Returns:
        A ``RookTokenizer`` wrapping the BPE tokenizer.
    """
    full_vocab = VOCAB + ACTION_SPACE + ["[OPTIONS]", "[VALUES]", "[ACTION]", "0000"]

    singles = [tok for tok in full_vocab if len(tok) == 1]
    specials = [tok for tok in full_vocab if len(tok) > 1]

    base = Tokenizer(BPE(
        vocab={tok: idx for idx, tok in enumerate(singles)},
        merges=[])
    )
    base.add_special_tokens(specials)

    return RookTokenizer(
        tokenizer_object=base,
        model_max_length=model_max_length,
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        clean_up_tokenization_spaces=False
    )
tokenizer_config.json CHANGED
@@ -33,6 +33,12 @@
33
  "special": true
34
  }
35
  },
 
 
 
 
 
 
36
  "clean_up_tokenization_spaces": false,
37
  "cls_token": "[CLS]",
38
  "mask_token": "[MASK]",
 
33
  "special": true
34
  }
35
  },
36
+ "auto_map": {
37
+ "AutoTokenizer": [
38
+ "model.RookTokenizer",
39
+ null
40
+ ]
41
+ },
42
  "clean_up_tokenization_spaces": false,
43
  "cls_token": "[CLS]",
44
  "mask_token": "[MASK]",