umarzein committed
Commit 4e070c6 · 1 Parent(s): 43481ea

Upload tokenizer

Files changed (3)
  1. special_tokens_map.json +1 -0
  2. tokenizer.json +162 -0
  3. tokenizer_config.json +6 -0
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
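
An empty map here means no special tokens (unk, pad, bos, eos) get registered on the transformers side when the repo is loaded. A minimal sketch of that effect, assuming the three uploaded files sit in a hypothetical local directory ./tokenizer:

    from transformers import PreTrainedTokenizerFast

    # ./tokenizer is a hypothetical local checkout of this repo
    tok = PreTrainedTokenizerFast.from_pretrained("./tokenizer")

    # With special_tokens_map.json empty, none of the wrapper-level special
    # tokens are set, so these should all come back as None.
    print(tok.unk_token, tok.pad_token, tok.eos_token)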
tokenizer.json ADDED
@@ -0,0 +1,162 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 0,
+       "special": false,
+       "content": "<>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 1,
+       "special": false,
+       "content": "bob",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 2,
+       "special": false,
+       "content": "tom",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 3,
+       "special": false,
+       "content": "bike",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 4,
+       "special": false,
+       "content": "speech",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 5,
+       "special": false,
+       "content": "take",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 6,
+       "special": false,
+       "content": "use",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 7,
+       "special": false,
+       "content": "talk",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 8,
+       "special": false,
+       "content": "go",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 9,
+       "special": false,
+       "content": "good",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 10,
+       "special": false,
+       "content": "active",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 11,
+       "special": false,
+       "content": "not",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 12,
+       "special": false,
+       "content": "and",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 13,
+       "special": false,
+       "content": "then",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 14,
+       "special": false,
+       "content": "but",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "id": 15,
+       "special": false,
+       "content": ".",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": {
+     "type": "Whitespace"
+   },
+   "post_processor": null,
+   "decoder": null,
+   "model": {
+     "type": "WordLevel",
+     "vocab": {},
+     "unk_token": "<>"
+   }
+ }
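
The file wires a Whitespace pre-tokenizer to a WordLevel model whose vocab is empty: all sixteen tokens, including the "<>" unknown marker, are carried as added_tokens and are matched before the model runs. A minimal sketch of loading and using it with the tokenizers library (the local file path is an assumption):

    from tokenizers import Tokenizer

    # assumes tokenizer.json has been downloaded to the working directory
    tok = Tokenizer.from_file("tokenizer.json")

    # ids follow the added_tokens entries above: bob=1, take=5, bike=3, .=15
    enc = tok.encode("bob take bike .")
    print(enc.tokens)  # ['bob', 'take', 'bike', '.']
    print(enc.ids)     # [1, 5, 3, 15]

A word outside the sixteen added_tokens would likely raise an error at encode time, since the "<>" fallback is itself absent from the empty WordLevel vocab.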
tokenizer_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "vocab_size": 16
+ }
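
The model_max_length value is int(1e30), the sentinel transformers writes when a tokenizer has no real length limit, and vocab_size matches the sixteen added tokens. A minimal end-to-end sketch, again assuming the files live in a hypothetical local ./tokenizer directory:

    from transformers import AutoTokenizer

    # tokenizer_class in the config resolves this to PreTrainedTokenizerFast
    tok = AutoTokenizer.from_pretrained("./tokenizer")

    print(len(tok))  # 16: the empty WordLevel vocab plus sixteen added_tokens
    # post_processor is null, so no special tokens are appended to the ids
    print(tok("tom and bob talk .")["input_ids"])  # [2, 12, 1, 7, 15]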