ylacombe's picture
ylacombe HF staff
Upload tokenizer.json with huggingface_hub (#2)
c6bd200
raw history blame
No virus
2.99 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 58,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Lowercase"
},
{
"type": "Replace",
"pattern": {
"Regex": "[^3\u0b85\u0b88\u0b89_\u0ba8\u0b95 \u0bc7\u0bbe\u0bc6\u0b8a\u0bc0'7a\u0b8f\u0bb5\u0bcd\u0bb9\u0bc2\u0bb2\u0b9f2\u0bb4\u0bb0\u0bae\u0b92\u0bb1\u0b9e\u0b9a9\u0bbf\u0bb8\u0bc8\u0ba96\u0b8e\u0b87\u0baf\u0bcc\u0ba4\u0bcb\u0bb31\u0b86\u0baa0\u0ba3\u0b93\u0b99\u0bc1\u0b90\u0b9c4\u0bb7\u0bca5]"
},
"content": ""
},
{
"type": "Strip",
"strip_left": true,
"strip_right": true
},
{
"type": "Replace",
"pattern": {
"Regex": "(?=.)|(?<!^)$"
},
"content": "3"
}
]
},
"pre_tokenizer": {
"type": "Split",
"pattern": {
"Regex": ""
},
"behavior": "Isolated",
"invert": false
},
"post_processor": null,
"decoder": null,
"model": {
"vocab": {
"3": 0,
"\u0b85": 1,
"\u0b88": 2,
"\u0b89": 3,
"_": 4,
"\u0ba8": 5,
"\u0b95": 6,
" ": 7,
"\u0bc7": 8,
"\u0bbe": 9,
"\u0bc6": 10,
"\u0b8a": 11,
"\u0bc0": 12,
"'": 13,
"7": 14,
"a": 15,
"\u0b8f": 16,
"\u0bb5": 17,
"\u0bcd": 18,
"\u0bb9": 19,
"\u0bc2": 20,
"\u0bb2": 21,
"\u0b9f": 22,
"2": 23,
"\u0bb4": 24,
"\u0bb0": 25,
"\u0bae": 26,
"\u0b92": 27,
"\u0bb1": 28,
"\u0b9e": 29,
"\u0b9a": 30,
"9": 31,
"\u0bbf": 32,
"\u0bb8": 33,
"\u0bc8": 34,
"\u0ba9": 35,
"6": 36,
"\u0b8e": 37,
"\u0b87": 38,
"\u0baf": 39,
"\u0bcc": 40,
"\u0ba4": 41,
"\u0bcb": 42,
"\u0bb3": 43,
"1": 44,
"\u0b86": 45,
"\u0baa": 46,
"0": 47,
"\u0ba3": 48,
"\u0b93": 49,
"\u0b99": 50,
"\u0bc1": 51,
"\u0b90": 52,
"\u0b9c": 53,
"4": 54,
"\u0bb7": 55,
"\u0bca": 56,
"5": 57,
"<unk>": 58
}
}
}