---
license: cc-by-nc-sa-4.0
datasets:
- wikipedia
- cc100
language:
- ja
library_name: transformers
pipeline_tag: fill-mask
---

BERT-base (Juman++ + BPE)
===

## How to load the tokenizer
Please download the dictionary file for Juman++ + BPE from [our GitHub repository](https://github.com/hitachi-nlp/compare-ja-tokenizer/blob/public/data/dict/jumanpp_bpe.json).
You can then load the tokenizer by passing the path of the dictionary file as `dict_path`. The snippet below assumes that the `pyknp`, `mojimoji`, and `textspan` packages are installed and that the `jumanpp` binary is available on your `PATH`.

```python
import traceback

from tokenizers import Tokenizer, NormalizedString, PreTokenizedString
from tokenizers.processors import BertProcessing
from tokenizers.pre_tokenizers import PreTokenizer
from transformers import PreTrainedTokenizerFast

from pyknp import Juman
import mojimoji
import textspan


class JumanPreTokenizer:
    def __init__(self):
        self.juman = Juman("jumanpp", multithreading=True)

    def tokenize(self, sequence: str) -> list[str]:
        # Juman++ expects full-width characters, so normalize first.
        text = mojimoji.han_to_zen(sequence).rstrip()
        try:
            result = self.juman.analysis(text)
        except Exception:
            traceback.print_exc()
            text = ""
            result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]

    def custom_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
        text = str(normalized_string)
        tokens = self.tokenize(text)
        # Map the morphemes back onto character spans of the original string.
        tokens_spans = textspan.get_original_spans(tokens, text)
        return [normalized_string[st:ed] for char_spans in tokens_spans for st, ed in char_spans]

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.custom_split)


# load a pre-tokenizer
pre_tokenizer = JumanPreTokenizer()

# load a tokenizer
dict_path = "/path/to/jumanpp_bpe.json"
tokenizer = Tokenizer.from_file(dict_path)
tokenizer.post_processor = BertProcessing(
    cls=("[CLS]", tokenizer.token_to_id('[CLS]')),
    sep=("[SEP]", tokenizer.token_to_id('[SEP]'))
)

# convert to PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token='[UNK]',
    cls_token='[CLS]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    mask_token='[MASK]'
)

# set a pre-tokenizer
tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(pre_tokenizer)
```

```python
# Test
test_str = "γ“γ‚“γ«γ‘γ―γ€‚η§γ―ε½’ζ…‹η΄ θ§£ζžε™¨γ«γ€γ„γ¦η ”η©Άγ‚’γ—γ¦γ„γΎγ™γ€‚"
tokenizer.convert_ids_to_tokens(tokenizer(test_str).input_ids)
# -> ['[CLS]','こ','んに','け','は','。','私','は','ε½’ζ…‹','η΄ ','解析','器','に','぀いて','η ”η©Ά','γ‚’','して','い','ます','。','[SEP]']
```

## How to load the model
```python
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("hitachi-nlp/bert-base_jumanpp-bpe")
```
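
As a quick end-to-end check, the snippet below is a minimal fill-mask sketch (not part of the original card): it combines the `tokenizer` and `model` built above, masks one token of the test sentence, and predicts it. The masked position (index 8, "ε½’ζ…‹") is specific to this sentence and this tokenizer.

```python
import torch

# Illustrative only: assumes `tokenizer` and `model` from the snippets above.
text = "γ“γ‚“γ«γ‘γ―γ€‚η§γ―ε½’ζ…‹η΄ θ§£ζžε™¨γ«γ€γ„γ¦η ”η©Άγ‚’γ—γ¦γ„γΎγ™γ€‚"
inputs = tokenizer(text, return_tensors="pt")

# Replace the token at position 8 ("ε½’ζ…‹" in the tokenization shown above) with [MASK].
masked_ids = inputs.input_ids.clone()
masked_ids[0, 8] = tokenizer.mask_token_id

with torch.no_grad():
    logits = model(input_ids=masked_ids, attention_mask=inputs.attention_mask).logits

# The model should recover the original token with high probability.
predicted_id = int(logits[0, 8].argmax(dim=-1))
print(tokenizer.convert_ids_to_tokens(predicted_id))
```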

**See [our repository](https://github.com/hitachi-nlp/compare-ja-tokenizer) for more details!**