mms-tts-rus / tokenizer.json
Xenova's picture
Xenova HF staff
Upload tokenizer.json
dc6502a
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 44,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Lowercase"
},
{
"type": "Replace",
"pattern": {
"Regex": "[^\u0447 \u0434\u044c\u044f\u0439\u0441\u0443\u0430\u043e20\u044a\u044b\u043d\u043b\u0436\u0445\u0448\u0437c_\u043fm\u044e\u044d\u04404\u0449o\u2013\u0444q\u0438\u0446\u043a1\u0435\u0431\\-\u0442\u0432\u0433\u043c]"
},
"content": ""
},
{
"type": "Strip",
"strip_left": true,
"strip_right": true
},
{
"type": "Replace",
"pattern": {
"Regex": "(?=.)|(?<!^)$"
},
"content": "\u0447"
}
]
},
"pre_tokenizer": {
"type": "Split",
"pattern": {
"Regex": ""
},
"behavior": "Isolated",
"invert": false
},
"post_processor": null,
"decoder": null,
"model": {
"vocab": {
"\u0447": 0,
" ": 1,
"\u0434": 2,
"\u044c": 3,
"\u044f": 4,
"\u0439": 5,
"\u0441": 6,
"\u0443": 7,
"\u0430": 8,
"\u043e": 9,
"2": 10,
"0": 11,
"\u044a": 12,
"\u044b": 13,
"\u043d": 14,
"\u043b": 15,
"\u0436": 16,
"\u0445": 17,
"\u0448": 18,
"\u0437": 19,
"c": 20,
"_": 21,
"\u043f": 22,
"m": 23,
"\u044e": 24,
"\u044d": 25,
"\u0440": 26,
"4": 27,
"\u0449": 28,
"o": 29,
"\u2013": 30,
"\u0444": 31,
"q": 32,
"\u0438": 33,
"\u0446": 34,
"\u043a": 35,
"1": 36,
"\u0435": 37,
"\u0431": 38,
"-": 39,
"\u0442": 40,
"\u0432": 41,
"\u0433": 42,
"\u043c": 43,
"<unk>": 44
}
}
}