SaulLu committed on
Commit
67b371b
1 Parent(s): eebcad8

add modified albert tokenizer

Browse files
Files changed (3) hide show
  1. README.md +16 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +3 -0
README.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ The purpose of this repo is to show the usefulness of saving the normalization operation used during tokenizer training.
3
+
4
+ ```python
5
+ from transformers import AutoTokenizer
6
+
7
+ text = "This is a text with àccënts and CAPITAL LETTERS"
8
+ tokenizer = AutoTokenizer.from_pretrained("albert-large-v2")
9
+ print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
10
+
11
+ # ['[CLS]', '▁this', '▁is', '▁a', '▁text', '▁with', '▁accent', 's', '▁and', '▁capital', '▁letters', '[SEP]']
12
+ tokenizer = AutoTokenizer.from_pretrained("huggingface-course/albert-tokenizer-without-normalizer")
13
+ print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
14
+ #
15
+ # ['[CLS]', '▁', '<unk>', 'his', '▁is', '▁a', '▁text', '▁with', '▁', '<unk>', 'cc', '<unk>', 'nts', '▁and', '▁', '<unk>', '▁', '<unk>', '[SEP]']
16
+ ```
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ {
2
+ "tokenizer_class": "AlbertTokenizer"
3
+ }