jrahn commited on
Commit
21c1d52
·
verified ·
1 Parent(s): 05d5902

Upload tokenizer

Browse files
Files changed (2) hide show
  1. model.py +116 -0
  2. tokenizer_config.json +6 -0
model.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
3
+
4
+ from transformers import (
5
+ LlamaConfig, LlamaForSequenceClassification, LlamaForCausalLM,
6
+ GPT2Config, GPT2ForSequenceClassification, GPT2LMHeadModel,
7
+ PreTrainedTokenizerFast
8
+ )
9
+ from tokenizers import Tokenizer
10
+ from tokenizers.models import BPE
11
+
12
+ from src.const import ACTION_SPACE, VOCAB
13
+
14
class RookTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer variant that never returns ``token_type_ids``."""

    # TODO: make it easier to use checkpoints from the hub
    # https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub

    def __call__(self, *args, **kwargs):
        # Override whatever the caller requested: token_type_ids are never emitted.
        kwargs = {**kwargs, "return_token_type_ids": False}
        return super().__call__(*args, **kwargs)
20
+
21
def make_model(config_dict, arch="llama"):
    """Build a model for the task named in ``config_dict["finetuning_task"]``.

    Dispatches to the sequence-classification builder for
    "text-classification" and the causal-LM builder for "text-generation".

    Raises:
        ValueError: for any other ``finetuning_task`` value.
    """
    task = config_dict["finetuning_task"]
    if task == "text-classification":
        return make_model_clf(config_dict, arch=arch)
    if task == "text-generation":
        return make_model_lm(config_dict, arch=arch)
    raise ValueError(f"Unknown config finetuning_task: {config_dict['finetuning_task']}")
28
+
29
def make_model_clf(config_dict, arch):
    """Build a freshly initialized sequence-classification model.

    The label space is ``ACTION_SPACE``; ``label2id``/``id2label`` mappings
    are installed on the config.

    Args:
        config_dict: config kwargs; mutated in place — ``"vocab_size"`` is
            overwritten with ``len(VOCAB)`` padded up to a multiple of 128.
        arch: ``"llama"`` or ``"gpt2"``.

    Returns:
        A ``LlamaForSequenceClassification`` or
        ``GPT2ForSequenceClassification`` instance.

    Raises:
        ValueError: if ``arch`` is not a supported architecture.
    """
    if arch == "llama":
        Config = LlamaConfig
        Model = LlamaForSequenceClassification
    elif arch == "gpt2":
        Config = GPT2Config
        Model = GPT2ForSequenceClassification
    else:
        # Previously an unknown arch fell through to a NameError on Config;
        # fail explicitly instead.
        raise ValueError(f"Unknown arch: {arch}")

    # Pad vocab size to the next multiple of 128 (hardware-friendly embedding size).
    config_dict["vocab_size"] = ((len(VOCAB) + 127) // 128) * 128
    config = Config(**config_dict)
    label_to_id = {v: i for i, v in enumerate(ACTION_SPACE)}
    config.num_labels = len(ACTION_SPACE)
    config.label2id = label_to_id
    config.id2label = {idx: label for label, idx in label_to_id.items()}
    model = Model(config=config)
    return model
46
+
47
def make_model_lm(config_dict, arch):
    """Build a freshly initialized causal language model.

    Args:
        config_dict: config kwargs; mutated in place — ``"vocab_size"`` is
            overwritten with the LM vocab size (base vocab + action space +
            4 extra special tokens) padded up to a multiple of 128.
        arch: ``"llama"`` or ``"gpt2"``.

    Returns:
        A ``LlamaForCausalLM`` or ``GPT2LMHeadModel`` instance.

    Raises:
        ValueError: if ``arch`` is not a supported architecture.
    """
    if arch == "llama":
        Config = LlamaConfig
        Model = LlamaForCausalLM
    elif arch == "gpt2":
        Config = GPT2Config
        Model = GPT2LMHeadModel
    else:
        # Previously an unknown arch fell through to a NameError on Config;
        # fail explicitly instead.
        raise ValueError(f"Unknown arch: {arch}")

    # Pad vocab size to the next multiple of 128; the +4 accounts for the
    # extra special tokens added in make_tokenizer_lm.
    config_dict["vocab_size"] = ((len(VOCAB) + len(ACTION_SPACE) + 4 + 127) // 128) * 128
    config = Config(**config_dict)
    model = Model(config=config)
    return model
59
+
60
+
61
def make_tokenizer(task="clf"):
    """Return the tokenizer for *task*.

    Supported tasks: "clf" (classification, max length 78), "lm"
    (generation, max length 79), "lm-cot" (generation with chain of
    thought, max length 116).

    Raises:
        ValueError: for any other task name.
    """
    builders = {
        "clf": lambda: make_tokenizer_clf(model_max_length=78),
        "lm": lambda: make_tokenizer_lm(model_max_length=79),
        "lm-cot": lambda: make_tokenizer_lm(model_max_length=116),
    }
    if task not in builders:
        raise ValueError(f"Unknown task: {task}")
    return builders[task]()
70
+
71
def make_tokenizer_clf(model_max_length):
    """Build the classification tokenizer from ``VOCAB``.

    Single-character vocab entries become the base BPE vocabulary; every
    multi-character entry is registered as a BPE merge (its characters
    joined into one token).

    Args:
        model_max_length: maximum sequence length recorded on the tokenizer.

    Returns:
        A ``RookTokenizer`` wrapping the BPE tokenizer.
    """
    single_char_vocab = [e for e in VOCAB if len(e) == 1]
    multi_char_vocab = [e for e in VOCAB if len(e) > 1]
    # NOTE(review): tuple(e) only yields a valid (left, right) merge pair for
    # two-character entries — assumes all multi-char VOCAB entries have len 2.
    merges = [tuple(e) for e in multi_char_vocab]
    # Removed leftover debug print of the first merges.

    tokenizer = Tokenizer(BPE(
        vocab=dict(zip(single_char_vocab, range(len(single_char_vocab)))),
        merges=merges)
    )

    fast_tokenizer = RookTokenizer(
        tokenizer_object=tokenizer,
        model_max_length=model_max_length,
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        clean_up_tokenization_spaces=False
    )
    return fast_tokenizer
92
+
93
def make_tokenizer_lm(model_max_length):
    """Build the language-model tokenizer.

    The vocabulary is ``VOCAB`` plus ``ACTION_SPACE`` plus four extra
    markers. Single-character entries form the base BPE vocabulary; every
    multi-character entry is added as a special token (no merges).

    Args:
        model_max_length: maximum sequence length recorded on the tokenizer.

    Returns:
        A ``RookTokenizer`` wrapping the BPE tokenizer.
    """
    full_vocab = VOCAB + ACTION_SPACE + ["[OPTIONS]", "[VALUES]", "[ACTION]", "0000"]

    singles = [tok for tok in full_vocab if len(tok) == 1]
    specials = [tok for tok in full_vocab if len(tok) > 1]

    base = Tokenizer(BPE(
        vocab={tok: idx for idx, tok in enumerate(singles)},
        merges=[])
    )
    base.add_special_tokens(specials)

    return RookTokenizer(
        tokenizer_object=base,
        model_max_length=model_max_length,
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        clean_up_tokenization_spaces=False
    )
tokenizer_config.json CHANGED
@@ -33,6 +33,12 @@
33
  "special": true
34
  }
35
  },
 
 
 
 
 
 
36
  "clean_up_tokenization_spaces": false,
37
  "cls_token": "[CLS]",
38
  "mask_token": "[MASK]",
 
33
  "special": true
34
  }
35
  },
36
+ "auto_map": {
37
+ "AutoTokenizer": [
38
+ "model.RookTokenizer",
39
+ null
40
+ ]
41
+ },
42
  "clean_up_tokenization_spaces": false,
43
  "cls_token": "[CLS]",
44
  "mask_token": "[MASK]",