| """ |
| Hugging Face tokenizer class for MINDI 1.0 420M. |
| """ |
|
|
| from pathlib import Path |
| from transformers import PreTrainedTokenizerFast |
|
|
|
|
class MindiTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer for MINDI 1.0 420M, loaded from a single ``tokenizer.json``.

    The tokenizer file is resolved in this order:

    1. an explicit ``tokenizer_file`` argument;
    2. ``tokenizer.json`` inside the directory given as the model path
       (``pretrained_model_name_or_path`` / ``name_or_path``), if it exists;
    3. ``tokenizer.json`` located next to this source file.

    The special tokens default to ``<BOS>``, ``<EOS>``, ``<UNK>`` and ``<PAD>``
    unless the caller supplies other values.
    """

    vocab_files_names = {"tokenizer_file": "tokenizer.json"}
    model_input_names = ["input_ids", "attention_mask"]

    @staticmethod
    def _find_local_tokenizer_file(directory):
        """Return the path of ``<directory>/tokenizer.json`` if it exists, else ``None``.

        ``directory`` may be a string, a path-like object, or ``None``. A value
        that is not an existing local directory (for example a remote model
        identifier) simply yields ``None``.
        """
        if directory is None:
            return None
        candidate = Path(str(directory)) / "tokenizer.json"
        return str(candidate) if candidate.exists() else None

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
        """Load the tokenizer, preferring a local ``tokenizer.json`` when present.

        When the caller did not pass ``tokenizer_file`` and
        ``pretrained_model_name_or_path`` is a local directory containing
        ``tokenizer.json``, that file is passed on explicitly; everything else
        is delegated to the parent ``from_pretrained``.
        """
        if kwargs.get("tokenizer_file") is None:
            local_file = cls._find_local_tokenizer_file(pretrained_model_name_or_path)
            if local_file is not None:
                kwargs["tokenizer_file"] = local_file
        return super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)

    def __init__(self, tokenizer_file=None, **kwargs):
        """Build the tokenizer from ``tokenizer_file`` or a discovered fallback.

        Args:
            tokenizer_file: Path to a serialized ``tokenizer.json``. When
                ``None``, the file is looked up under ``name_or_path`` (if
                given in ``kwargs``) and then next to this module.
            **kwargs: Forwarded to the parent constructor. Missing
                ``bos_token``, ``eos_token``, ``unk_token`` and ``pad_token``
                entries are filled with the MINDI defaults.
        """
        # Read name_or_path WITHOUT consuming it: the parent constructor is the
        # one that records it on the instance, so popping it here would leave
        # the loaded tokenizer without its origin path.
        name_or_path = kwargs.get("name_or_path")
        if name_or_path is None:
            # Drop an explicit None so the parent applies its own default.
            kwargs.pop("name_or_path", None)

        if tokenizer_file is None:
            tokenizer_file = self._find_local_tokenizer_file(name_or_path)
        if tokenizer_file is None:
            # Last resort: the tokenizer.json shipped alongside this file.
            tokenizer_file = str(Path(__file__).resolve().parent / "tokenizer.json")

        kwargs.setdefault("bos_token", "<BOS>")
        kwargs.setdefault("eos_token", "<EOS>")
        kwargs.setdefault("unk_token", "<UNK>")
        kwargs.setdefault("pad_token", "<PAD>")
        super().__init__(tokenizer_file=tokenizer_file, **kwargs)
|
|