""" """ import os from transformers import LlamaTokenizer from vocab import TokenizerType, TokenizerImpl CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer") tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR) tokenizer.parent = "" tokenizer.type = TokenizerType.ByteBPE.name tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py tokenizer.comments = "split all numbers into individual digits, " \ "and fallback to bytes to decompose unknown UTF-8 characters"