diff --git "a/tokenizer_config.json" "b/tokenizer_config.json" new file mode 100644--- /dev/null +++ "b/tokenizer_config.json" @@ -0,0 +1,24113 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "kanggo", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32001": { + "content": "bisa", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32002": { + "content": "saka", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32003": { + "content": "dina", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32004": { + "content": "sareng", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32005": { + "content": "bakal", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32006": { + "content": "karo", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32007": { + "content": "pikeun", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32008": { + "content": "kuwi", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32009": { + "content": "jeung", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32010": { + "content": "wong", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32011": { + "content": "paling", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32012": { + "content": "wis", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32013": { + "content": "utawa", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32014": { + "content": "sampeyan", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32015": { + "content": "dadi", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32016": { + "content": "anjeun", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32017": { + "content": "kang", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32018": { + "content": "kanthi", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32019": { + "content": "luwih", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32020": { + "content": "mung", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32021": { + "content": "taun", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32022": { + "content": "minangka", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32023": { + "content": "Aku", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32024": { + "content": "tapi", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32025": { + "content": "meuble", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32026": { + "content": "nanging", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32027": { + "content": "urang", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32028": { + "content": "kita", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32029": { + "content": "salah", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32030": { + "content": "kudu", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32031": { + "content": "bokep", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32032": { + "content": "nganggo", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32033": { + "content": "sarta", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32034": { + "content": "teu", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32035": { + "content": "tina", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32036": { + "content": "tak", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32037": { + "content": "apik", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32038": { + "content": "siji", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32039": { + "content": "ngentot", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32040": { + "content": "nalika", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32041": { + "content": "kaya", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32042": { + "content": "jadi", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32043": { + "content": "padha", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32044": { + "content": "kana", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32045": { + "content": "cara", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32046": { + "content": "hiji", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "32047": { + "content": "/>", + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 4096, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +}