Severino committed
Commit cdac438
Parent: 6f11dfa

solved tokenizer-vocab incompatibility

Files changed (4):
  1. special_tokens_map.json +51 -1
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +41 -38
  4. vocab.json +0 -0
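The fix can be sanity-checked by loading the tokenizer with transformers and confirming that every special token resolves to its own id in vocab.json instead of falling back to <unk>. A minimal sketch, assuming a local checkout at ./roberta-base-bne/ (the name_or_path written into tokenizer_config.json below; the path is illustrative):

from transformers import AutoTokenizer

# Path is an assumption; it mirrors name_or_path in tokenizer_config.json.
tok = AutoTokenizer.from_pretrained("./roberta-base-bne/")

# Each special token should map to a dedicated id in vocab.json; only
# <unk> itself is allowed to share the unknown-token id.
for name, token in tok.special_tokens_map.items():
    token_id = tok.convert_tokens_to_ids(token)
    assert token == tok.unk_token or token_id != tok.unk_token_id, name
    print(f"{name}: {token!r} -> id {token_id}")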
special_tokens_map.json CHANGED
@@ -1 +1,51 @@
-{"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
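The per-token fields are the standard AddedToken options. The one asymmetry worth noting is that <mask> sets lstrip to true, so the whitespace in front of a mask is absorbed into the token itself, which is what mask-filling expects. A short illustration (the Spanish sentence is made up for this sketch; the API calls are standard transformers):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./roberta-base-bne/")  # path as above

# lstrip=True on mask_token: the space before "<mask>" is consumed by
# the mask token rather than becoming a separate space-prefixed piece.
text = f"La capital de España es {tok.mask_token}."
print(tok.tokenize(text))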
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
tokenizer_config.json CHANGED
@@ -1,63 +1,66 @@
 {
-  "unk_token": {
-    "content": "<unk>",
-    "single_word": false,
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
     "lstrip": false,
-    "rstrip": false,
     "normalized": true,
-    "__type": "AddedToken"
+    "rstrip": false,
+    "single_word": false
   },
-  "bos_token": {
+  "cls_token": {
+    "__type": "AddedToken",
     "content": "<s>",
-    "single_word": false,
     "lstrip": false,
-    "rstrip": false,
     "normalized": true,
-    "__type": "AddedToken"
+    "rstrip": false,
+    "single_word": false
   },
   "eos_token": {
+    "__type": "AddedToken",
     "content": "</s>",
-    "single_word": false,
     "lstrip": false,
-    "rstrip": false,
     "normalized": true,
-    "__type": "AddedToken"
+    "rstrip": false,
+    "single_word": false
   },
-  "add_prefix_space": false,
   "errors": "replace",
-  "sep_token": {
-    "content": "</s>",
-    "single_word": false,
-    "lstrip": false,
-    "rstrip": false,
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
     "normalized": true,
-    "__type": "AddedToken"
-  },
-  "cls_token": {
-    "content": "<s>",
-    "single_word": false,
-    "lstrip": false,
     "rstrip": false,
-    "normalized": true,
-    "__type": "AddedToken"
+    "single_word": false
   },
+  "max_len": 512,
+  "model_max_length": 512,
+  "name_or_path": "./roberta-base-bne/",
   "pad_token": {
+    "__type": "AddedToken",
     "content": "<pad>",
-    "single_word": false,
     "lstrip": false,
-    "rstrip": false,
     "normalized": true,
-    "__type": "AddedToken"
-  },
-  "mask_token": {
-    "content": "<mask>",
-    "single_word": false,
-    "lstrip": true,
     "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
     "normalized": true,
-    "__type": "AddedToken"
+    "rstrip": false,
+    "single_word": false
   },
-  "max_len": 512,
   "special_tokens_map_file": null,
-  "name_or_path": "/gpfs/projects/bsc88/tools/corpus-utils-lm/17-06-2021-python/output/bne_es_output/roberta-2021-06-17-1849-3a6a-9c4f/train_tokenizer_output/train-tokenizer-2021-06-17-2216-3a6a-3cd6"
-}
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
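Beyond reordering the keys, the rewrite pins tokenizer_class to RobertaTokenizer, replaces the cluster-internal training path in name_or_path with a relative one, and adds model_max_length, the key that current transformers releases actually read (max_len is the older, deprecated spelling, kept alongside it here). A quick check under the same local-path assumption as above:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./roberta-base-bne/")
print(type(tok).__name__)    # RobertaTokenizerFast (the fast variant is the default)
print(tok.model_max_length)  # 512, taken from model_max_length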
vocab.json CHANGED
The diff for this file is too large to render. See raw diff