
# Japanese BERT-base (MeCab + WordPiece)

## How to load the tokenizer

Please download the dictionary file for MeCab + WordPiece from our GitHub repository. You can then load the tokenizer by passing the path to the dictionary file as `dict_path`.

```python
from typing import Optional

from tokenizers import Tokenizer, NormalizedString, PreTokenizedString
from tokenizers.processors import BertProcessing
from tokenizers.pre_tokenizers import PreTokenizer
from transformers import PreTrainedTokenizerFast

from MeCab import Tagger
import textspan

class MecabPreTokenizer:
    def __init__(self, mecab_dict_path: Optional[str] = None):
        mecab_option = (f"-Owakati -d {mecab_dict_path}" if mecab_dict_path is not None else "-Owakati")
        self.mecab = Tagger(mecab_option)
    
    def tokenize(self, sequence: str) -> list[str]:
        return self.mecab.parse(sequence).strip().split(" ")
    
    def custom_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
        # Tokenize with MeCab, then map each token back to its character span
        # in the original text so that offsets are preserved.
        text = str(normalized_string)
        tokens = self.tokenize(text)
        tokens_spans = textspan.get_original_spans(tokens, text)
        return [normalized_string[st:ed] for char_spans in tokens_spans for st, ed in char_spans]
    
    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.custom_split)

# load the tokenizer from the downloaded dictionary file
dict_path = "/path/to/mecab_wordpiece.json"
tokenizer = Tokenizer.from_file(dict_path)
# instantiate the MeCab pre-tokenizer
pre_tokenizer = MecabPreTokenizer()
tokenizer.post_processor = BertProcessing(
    cls=("[CLS]", tokenizer.token_to_id('[CLS]')),
    sep=("[SEP]", tokenizer.token_to_id('[SEP]'))
)
# convert to PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token='[UNK]',
    cls_token='[CLS]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    mask_token='[MASK]'
)
# set a pre-tokenizer
tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(pre_tokenizer)
# Test
test_str = "γ“γ‚“γ«γ‘γ―γ€‚η§γ―ε½’ζ…‹η΄ θ§£ζžε™¨γ«γ€γ„γ¦η ”η©Άγ‚’γ—γ¦γ„γΎγ™γ€‚"
tokenizer.convert_ids_to_tokens(tokenizer(test_str).input_ids)
# -> ['[CLS]','こ','##γ‚“','##に','##け','##は','。','私','は','ε½’ζ…‹','##η΄ ','解','##析','器','に぀いて','η ”η©Ά','γ‚’','し','て','い','ます','。','[SEP]']
```
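As a quick sanity check (not part of the original card), you can also run the MeCab segmentation step on its own; the word boundaries it produces are what WordPiece then splits further, and the exact segmentation depends on the MeCab dictionary installed on your machine.

```python
# Continues from the snippet above (MecabPreTokenizer is defined there).
# The segmentation depends on your locally installed MeCab dictionary.
pre_tok = MecabPreTokenizer()
words = pre_tok.tokenize("η§γ―ε½’ζ…‹η΄ θ§£ζžε™¨γ«γ€γ„γ¦η ”η©Άγ‚’γ—γ¦γ„γΎγ™γ€‚")
print(words)  # e.g. ['私', 'は', 'ε½’ζ…‹η΄ ', ...]
```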

## How to load the model

```python
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained("hitachi-nlp/bert-base_mecab-wordpiece")
```
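To check that the tokenizer and model work together, here is a minimal masked-token prediction sketch (not from the original card). It assumes `tokenizer` is the `PreTrainedTokenizerFast` built above, that `[MASK]` is registered there as a special token, and that PyTorch is installed.

```python
import torch

# Minimal sketch: mask one word of a sentence and let the model fill it in.
# Assumes `tokenizer` is the PreTrainedTokenizerFast constructed above.
model.eval()
masked_str = "η§γ―[MASK]解析器に぀いてη ”η©Άγ‚’γ—γ¦γ„γΎγ™γ€‚"
inputs = tokenizer(masked_str, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Position of the [MASK] token and the highest-scoring replacement
mask_pos = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_ids = logits[0, mask_pos].argmax(dim=-1)
print(tokenizer.convert_ids_to_tokens(predicted_ids.tolist()))
```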

See our repository for more details!
