diff --git "a/tokenizer.json" "b/tokenizer.json" new file mode 100644--- /dev/null +++ "b/tokenizer.json" @@ -0,0 +1,103835 @@ +{ + "version": "1.0", + "truncation": { + "direction": "Right", + "max_length": 512, + "strategy": "LongestFirst", + "stride": 0 + }, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": true, + "rstrip": false, + "normalized": true, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": { + "type": "RobertaProcessing", + "sep": [ + "", + 2 + ], + "cls": [ + "", + 0 + ], + "trim_offsets": true, + "add_prefix_space": false + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": "", + "end_of_word_suffix": "", + "fuse_unk": false, + "byte_fallback": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "!": 5, + "\"": 6, + "#": 7, + "$": 8, + "%": 9, + "&": 10, + "'": 11, + "(": 12, + ")": 13, + "*": 14, + "+": 15, + ",": 16, + "-": 17, + ".": 18, + "/": 19, + "0": 20, + "1": 21, + "2": 22, + "3": 23, + "4": 24, + "5": 25, + "6": 26, + "7": 27, + "8": 28, + "9": 29, + ":": 30, + ";": 31, + "<": 32, + "=": 33, + ">": 34, + "?": 35, + "@": 36, + "A": 37, + "B": 38, + "C": 39, + "D": 40, + "E": 41, + "F": 42, + "G": 43, + "H": 44, + "I": 45, + "J": 46, + "K": 47, + "L": 48, + "M": 49, + "N": 50, + "O": 51, + "P": 52, + "Q": 53, + "R": 54, + "S": 55, + "T": 56, + "U": 57, + "V": 58, + "W": 59, + "X": 60, + "Y": 61, + "Z": 62, + "[": 63, + "\\": 64, + "]": 65, + "^": 66, + "_": 67, + "`": 68, + "a": 69, + "b": 70, + "c": 71, + "d": 72, + "e": 73, + "f": 74, + "g": 75, + "h": 76, + "i": 77, + "j": 78, + "k": 79, + "l": 80, + "m": 81, + "n": 82, + "o": 83, + "p": 84, + "q": 85, + "r": 86, + "s": 87, + "t": 88, + "u": 89, + "v": 90, + "w": 91, + "x": 92, + "y": 93, + "z": 94, + "{": 95, + "|": 96, + "}": 97, + "~": 98, + "¡": 99, + "¢": 100, + "£": 101, + "¤": 102, + "¥": 103, + "¦": 104, + "§": 105, + "¨": 106, + "©": 107, + "ª": 108, + "«": 109, + "¬": 110, + "®": 111, + "¯": 112, + "°": 113, + "±": 114, + "²": 115, + "³": 116, + "´": 117, + "µ": 118, + "¶": 119, + "·": 120, + "¸": 121, + "¹": 122, + "º": 123, + "»": 124, + "¼": 125, + "½": 126, + "¾": 127, + "¿": 128, + "À": 129, + "Á": 130, + "Â": 131, + "Ã": 132, + "Ä": 133, + "Å": 134, + "Æ": 135, + "Ç": 136, + "È": 137, + "É": 138, + "Ê": 139, + "Ë": 140, + "Ì": 141, + "Í": 142, + "Î": 143, + "Ï": 144, + "Ð": 145, + "Ñ": 146, + "Ò": 147, + "Ó": 148, + "Ô": 149, + "Õ": 150, + "Ö": 151, + "×": 152, + "Ø": 153, + "Ù": 154, + "Ú": 155, + "Û": 156, + "Ü": 157, + "Ý": 158, + "Þ": 159, + "ß": 160, + "à": 161, + "á": 162, + "â": 163, + "ã": 164, + "ä": 165, + "å": 166, + "æ": 167, + "ç": 168, + "è": 169, + "é": 170, + "ê": 171, + "ë": 172, + "ì": 173, + "í": 174, + "î": 175, + "ï": 176, + "ð": 177, + "ñ": 178, + "ò": 179, + "ó": 180, + "ô": 181, + "õ": 182, + "ö": 183, + "÷": 184, + "ø": 185, + "ù": 186, + "ú": 187, + "û": 188, + "ü": 189, + "ý": 190, + "þ": 191, + "ÿ": 192, + "Ā": 193, + "ā": 194, + "Ă": 195, + "ă": 196, + "Ą": 197, + "ą": 198, + "Ć": 199, + "ć": 200, + "Ĉ": 201, + "ĉ": 202, + "Ċ": 203, + "ċ": 204, + "Č": 205, + "č": 206, + "Ď": 207, + "ď": 208, + "Đ": 209, + "đ": 210, + "Ē": 211, + "ē": 212, + "Ĕ": 213, + "ĕ": 214, + "Ė": 215, + "ė": 216, + "Ę": 217, + "ę": 218, + "Ě": 219, + "ě": 220, + "Ĝ": 221, + "ĝ": 222, + "Ğ": 223, + "ğ": 224, + "Ġ": 225, + "ġ": 226, + "Ģ": 227, + "ģ": 228, + "Ĥ": 229, + "ĥ": 230, + "Ħ": 231, + "ħ": 232, + "Ĩ": 233, + "ĩ": 234, + "Ī": 235, + "ī": 236, + "Ĭ": 237, + "ĭ": 238, + "Į": 239, + "į": 240, + "İ": 241, + "ı": 242, + "IJ": 243, + "ij": 244, + "Ĵ": 245, + "ĵ": 246, + "Ķ": 247, + "ķ": 248, + "ĸ": 249, + "Ĺ": 250, + "ĺ": 251, + "Ļ": 252, + "ļ": 253, + "Ľ": 254, + "ľ": 255, + "Ŀ": 256, + "ŀ": 257, + "Ł": 258, + "ł": 259, + "Ń": 260, + "an": 261, + "Ġd": 262, + "er": 263, + "en": 264, + "ar": 265, + "Ġm": 266, + "la": 267, + "ang": 268, + "in": 269, + "Ġs": 270, + "Ġp": 271, + "at": 272, + "Ġk": 273, + "Ġb": 274, + "Ġt": 275, + "da": 276, + "si": 277, + "Ġdi": 278, + "un": 279, + "kan": 280, + "em": 281, + "al": 282, + "ah": 283, + "am": 284, + "ya": 285, + "as": 286, + "ada": 287, + "do": 288, + "yang": 289, + "Ġyang": 290, + "Ġse": 291, + "eng": 292, + "Ġdan": 293, + "on": 294, + "es": 295, + "tu": 296, + "il": 297, + "or": 298, + "ari": 299, + "ga": 300, + "lah": 301, + "ak": 302, + "ĠS": 303, + "is": 304, + "ik": 305, + "us": 306, + "um": 307, + "doc": 308, + "