robot-test committed on
Commit ae57c41
Parent: ac5bf91

Create README.md

Files changed (1)
  1. README.md +58 -0
README.md ADDED

Toy WordLevel tokenizer created for testing.

Code used for its creation:

```
from tokenizers import Tokenizer, normalizers, pre_tokenizers
from tokenizers.models import WordLevel
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Digits, Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer


# Tiny training corpus: an iterable of batches of sentences.
SMALL_TRAINING_CORPUS = [
    ["This is the first sentence.", "This is the second one."],
    ["This sentence (contains #) over symbols and numbers 12 3.", "But not this one."],
]

# Word-level model: one token per word, with an explicit unknown token.
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

# Split on whitespace, then split numbers into individual digits.
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])

# Wrap single sentences and sentence pairs in [CLS]/[SEP]; the ids match
# the order of special_tokens passed to the trainer below.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)

trainer = WordLevelTrainer(vocab_size=100, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

tokenizer.train_from_iterator(SMALL_TRAINING_CORPUS, trainer=trainer)

tokenizer.save("tokenizer.json")
```
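
For a quick sanity check, the saved tokenizer can be reloaded and used directly (a minimal sketch, not part of the original creation script; the exact token ids depend on the trained vocabulary):

```
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")

# Encoding two sequences triggers the `pair` template:
# [CLS] $A [SEP] $B [SEP], with type_ids 0 for $A and 1 for $B.
encoding = tok.encode("This is the first sentence.", "But not this one.")
print(encoding.tokens)
print(encoding.type_ids)
```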

Code used to wrap it as a `transformers` tokenizer and push it to the Hub:

```
from transformers import PreTrainedTokenizerFast

# Wrap the raw tokenizer file so it behaves like any other fast tokenizer.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    bos_token="[CLS]",
    eos_token="[SEP]",
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    model_max_length=10,
    padding_side="right",
)

tokenizer.push_to_hub("dummy-tokenizer-wordlevel", commit_message="add tokenizer")
```
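
Once pushed, the tokenizer can be loaded back by repo id (a sketch; `robot-test/dummy-tokenizer-wordlevel` assumes the push above ran under the `robot-test` account):

```
from transformers import AutoTokenizer

# Repo id is an assumption: push_to_hub uploads to the authenticated
# user's namespace, taken here to be "robot-test".
tok = AutoTokenizer.from_pretrained("robot-test/dummy-tokenizer-wordlevel")

# model_max_length=10 takes effect once truncation is requested.
print(tok("This sentence contains numbers 12 3.", truncation=True)["input_ids"])
```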