ShreyaR committed
Commit 9d26035
1 Parent(s): e1eda83

add tokenizer

Files changed (2):
  1. tokenizer.json +11 -13
  2. tokenizer_config.json +1 -1
tokenizer.json CHANGED
@@ -7,9 +7,7 @@
     "stride": 0
   },
   "padding": {
-    "strategy": {
-      "Fixed": 512
-    },
+    "strategy": "BatchLongest",
     "direction": "Right",
     "pad_to_multiple_of": null,
     "pad_id": 1,
@@ -19,48 +17,48 @@
   "added_tokens": [
     {
       "id": 0,
-      "special": true,
       "content": "<s>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false
+      "normalized": false,
+      "special": true
     },
     {
       "id": 1,
-      "special": true,
       "content": "<pad>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false
+      "normalized": false,
+      "special": true
     },
     {
       "id": 2,
-      "special": true,
       "content": "</s>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false
+      "normalized": false,
+      "special": true
     },
     {
       "id": 3,
-      "special": true,
       "content": "<unk>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false
+      "normalized": false,
+      "special": true
     },
     {
       "id": 50264,
-      "special": true,
       "content": "<mask>",
       "single_word": false,
       "lstrip": true,
       "rstrip": false,
-      "normalized": false
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
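The functional change in tokenizer.json is the padding strategy: batches are now padded to their longest sequence instead of a fixed 512 tokens, while the added_tokens entries only reorder their fields. A minimal sketch of the effect with the tokenizers library, assuming this file is available locally as tokenizer.json (the path and the sample sentences are illustrative):

from tokenizers import Tokenizer

# Load the committed tokenizer definition (local path is an assumption).
tok = Tokenizer.from_file("tokenizer.json")

# With "strategy": "BatchLongest", each batch is padded to its longest member
# rather than to a fixed 512 tokens as before.
encodings = tok.encode_batch(["a short sentence", "a noticeably longer sentence used for comparison"])
print([len(enc.ids) for enc in encodings])  # equal lengths, set by the longest sequence in the batch

# The special tokens themselves are unchanged; only the field order inside added_tokens moved.
print(tok.token_to_id("<mask>"))  # 50264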
tokenizer_config.json CHANGED
@@ -1 +1 @@
-{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "trim_offsets": true, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "roberta-base", "tokenizer_class": "RobertaTokenizer"}
+{"errors": "replace", "bos_token": "<s>", "eos_token": "</s>", "sep_token": "</s>", "cls_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": "<mask>", "add_prefix_space": false, "trim_offsets": true, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "roberta-base", "tokenizer_class": "RobertaTokenizer"}