import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

from transformers import (
    LlamaConfig, LlamaForSequenceClassification, LlamaForCausalLM,
    GPT2Config, GPT2ForSequenceClassification, GPT2LMHeadModel,
    PreTrainedTokenizerFast
)
from tokenizers import Tokenizer
from tokenizers.models import BPE

from src.const import ACTION_SPACE, VOCAB
|
|
class RookTokenizer(PreTrainedTokenizerFast):
    """PreTrainedTokenizerFast wrapper that never returns token_type_ids."""

    def __call__(self, *args, **kwargs):
        kwargs["return_token_type_ids"] = False
        return super().__call__(*args, **kwargs)
|
|
def make_model(config_dict, arch="llama"):
    if config_dict["finetuning_task"] == "text-classification":
        return make_model_clf(config_dict, arch=arch)
    elif config_dict["finetuning_task"] == "text-generation":
        return make_model_lm(config_dict, arch=arch)
    else:
        raise ValueError(f"Unknown config finetuning_task: {config_dict['finetuning_task']}")
|
|
def make_model_clf(config_dict, arch):
    if arch == "llama":
        Config = LlamaConfig
        Model = LlamaForSequenceClassification
    elif arch == "gpt2":
        Config = GPT2Config
        Model = GPT2ForSequenceClassification
    else:
        raise ValueError(f"Unknown arch: {arch}")

    # round the vocab size up to a multiple of 128 for efficient embedding shapes
    config_dict["vocab_size"] = ((len(VOCAB) + 127) // 128) * 128
    config = Config(**config_dict)
    label_to_id = {v: i for i, v in enumerate(ACTION_SPACE)}
    config.num_labels = len(ACTION_SPACE)
    config.label2id = label_to_id
    config.id2label = {i: label for label, i in label_to_id.items()}
    model = Model(config=config)
    return model
|
|
def make_model_lm(config_dict, arch):
    if arch == "llama":
        Config = LlamaConfig
        Model = LlamaForCausalLM
    elif arch == "gpt2":
        Config = GPT2Config
        Model = GPT2LMHeadModel
    else:
        raise ValueError(f"Unknown arch: {arch}")

    # vocab covers VOCAB, the action space and the 4 extra tokens added in
    # make_tokenizer_lm, rounded up to a multiple of 128
    config_dict["vocab_size"] = ((len(VOCAB) + len(ACTION_SPACE) + 4 + 127) // 128) * 128
    config = Config(**config_dict)
    model = Model(config=config)
    return model
|
|
def make_tokenizer(task="clf"):
    if task == "clf":
        return make_tokenizer_clf(model_max_length=78)
    elif task == "lm":
        return make_tokenizer_lm(model_max_length=79)
    elif task == "lm-cot":
        return make_tokenizer_lm(model_max_length=116)
    else:
        raise ValueError(f"Unknown task: {task}")
|
|
def make_tokenizer_clf(model_max_length):
    single_char_vocab = [e for e in VOCAB if len(e) == 1]
    multi_char_vocab = [e for e in VOCAB if len(e) > 1]
    # each multi-character vocab entry becomes a BPE merge of its characters
    merges = [tuple(e) for e in multi_char_vocab]

    tokenizer = Tokenizer(BPE(
        vocab=dict(zip(single_char_vocab, range(len(single_char_vocab)))),
        merges=merges
    ))

    fast_tokenizer = RookTokenizer(
        tokenizer_object=tokenizer,
        model_max_length=model_max_length,
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        clean_up_tokenization_spaces=False
    )
    return fast_tokenizer
|
|
def make_tokenizer_lm(model_max_length):
    vocab = VOCAB + ACTION_SPACE
    vocab += ["[OPTIONS]", "[VALUES]", "[ACTION]", "0000"]

    single_char_vocab = [e for e in vocab if len(e) == 1]
    multi_char_vocab = [e for e in vocab if len(e) > 1]
    merges = []

    tokenizer = Tokenizer(BPE(
        vocab=dict(zip(single_char_vocab, range(len(single_char_vocab)))),
        merges=merges
    ))
    # multi-character entries (actions and section markers) are registered as
    # added special tokens instead of being built from BPE merges
    tokenizer.add_special_tokens(multi_char_vocab)

    fast_tokenizer = RookTokenizer(
        tokenizer_object=tokenizer,
        model_max_length=model_max_length,
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        clean_up_tokenization_spaces=False
    )
    return fast_tokenizer
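

# Minimal usage sketch, kept out of the import path behind __main__. The config values
# below are illustrative assumptions (any valid LlamaConfig keyword arguments work the
# same way), not the project's actual training configuration.
if __name__ == "__main__":
    example_config = {
        "finetuning_task": "text-classification",  # routes make_model to make_model_clf
        "hidden_size": 256,
        "intermediate_size": 512,
        "num_hidden_layers": 4,
        "num_attention_heads": 4,
    }
    tokenizer = make_tokenizer(task="clf")
    model = make_model(example_config, arch="llama")
    print(type(model).__name__, model.config.num_labels, tokenizer.model_max_length)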