{ "version": "1.0", "truncation": null, "padding": null, "added_tokens": [ { "id": 0, "content": "UNK", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 1, "content": "PAD", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 55, "content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true } ], "normalizer": { "type": "Sequence", "normalizers": [ { "type": "Replace", "pattern": { "String": "\n" }, "content": " UTT_BOUNDARY" }, { "type": "Strip", "strip_left": true, "strip_right": true } ] }, "pre_tokenizer": { "type": "Whitespace" }, "post_processor": null, "decoder": null, "model": { "type": "WordLevel", "vocab": { "UNK": 0, "PAD": 1, "BOS": 2, "EOS": 3, "WORD_BOUNDARY": 4, "UTT_BOUNDARY": 5, "m": 6, "ɛ": 7, "ɲ": 8, "y": 9, "n": 10, "k": 11, "ɑ": 12, "r": 13, "aː": 14, "d": 15, "i": 16, "o": 17, "h": 18, "z": 19, "v": 20, "l": 21, "eː": 22, "j": 23, "ʃ": 24, "ɟ": 25, "s": 26, "oː": 27, "p": 28, "t": 29, "tsː": 30, "b": 31, "u": 32, "ɡ": 33, "tː": 34, "f": 35, "ø": 36, "t̠ʃ": 37, "uː": 38, "iː": 39, "ts": 40, "ɟː": 41, "yː": 42, "øː": 43, "ʎ": 44, "t̠ʃː": 45, "c": 46, "ɡː": 47, "kː": 48, "ɑː": 49, "dː": 50, "pː": 51, "ʒ": 52, "cː": 53, "bː": 54 }, "unk_token": "UNK" } }