from __future__ import annotations

import json
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from tokenizers import Tokenizer
# Pre-tokenizers that split on whitespace, punctuation, or other delimiters.
# They would break multiword units apart, so they are dropped entirely.
_FORBIDDEN_PRETOKENIZERS = (
    "Whitespace",
    "WhitespaceSplit",
    "BertPreTokenizer",
    "CharDelimiterSplit",
    "Punctuation",
    "Split",
    "UnicodeScripts",
)

# Fallback pre-tokenizer: a Metaspace that prepends "▁" but never splits the input.
_BASIC_METASPACE = {"type": "Metaspace", "replacement": "▁", "prepend_scheme": "always", "split": False}
def _fix_single_pretokenizer(pre_tokenizer: dict[str, Any]) -> dict[str, Any] | None:
    """Fixes a single pretokenizer to allow multiword units."""
    if pre_tokenizer["type"] in _FORBIDDEN_PRETOKENIZERS:
        # Splitting pre-tokenizers cannot be repaired; signal that they should be removed.
        return None
    if pre_tokenizer["type"] == "ByteLevel":
        pre_tokenizer["add_prefix_space"] = True
        # Disable the regex split so the input is no longer broken on whitespace/punctuation.
        pre_tokenizer["use_regex"] = False
    if pre_tokenizer["type"] == "Metaspace":
        pre_tokenizer["split"] = False
        pre_tokenizer["prepend_scheme"] = "always"

    return pre_tokenizer
def replace_pretokenizer(tokenizer: Tokenizer) -> Tokenizer:
    """Fixes the pretokenizer of a tokenizer to allow multiword units."""
    tokenizer_json = json.loads(tokenizer.to_str())
    pre_tokenizer_json = tokenizer_json.get("pre_tokenizer", None)
    if pre_tokenizer_json is None:
        # No pre-tokenizer at all: fall back to the basic Metaspace.
        pre_tokenizer_json = _BASIC_METASPACE
    elif pre_tokenizer_json["type"] == "Sequence":
        # Fix every member of the sequence and drop the ones that cannot be fixed.
        new_pretokenizers = []
        for single_pretokenizer in pre_tokenizer_json["pretokenizers"]:
            new_pretokenizer = _fix_single_pretokenizer(single_pretokenizer)
            if new_pretokenizer is not None:
                new_pretokenizers.append(new_pretokenizer)
        if new_pretokenizers:
            pre_tokenizer_json["pretokenizers"] = new_pretokenizers
        else:
            # Every member was forbidden: fall back to the basic Metaspace.
            pre_tokenizer_json = _BASIC_METASPACE

    # Fix a single (non-Sequence) pre-tokenizer; Sequence dicts pass through unchanged.
    pre_tokenizer_json = _fix_single_pretokenizer(pre_tokenizer_json) or _BASIC_METASPACE

    tokenizer_json["pre_tokenizer"] = pre_tokenizer_json

    return tokenizer.from_str(json.dumps(tokenizer_json))
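

# Usage sketch (hedged): how this helper might be applied to a HuggingFace
# `tokenizers.Tokenizer`. The checkpoint name below is an illustrative
# assumption, not something this module depends on.
if __name__ == "__main__":
    from tokenizers import Tokenizer

    # Any fast tokenizer works; "bert-base-uncased" is just an example checkpoint.
    tok = Tokenizer.from_pretrained("bert-base-uncased")
    # BertPreTokenizer is forbidden (it splits on whitespace and punctuation),
    # so it is replaced by the basic Metaspace pre-tokenizer defined above.
    fixed = replace_pretokenizer(tok)
    print(json.loads(fixed.to_str())["pre_tokenizer"])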