benjaminn committed
Commit 12bd715
1 Parent(s): fbd24af

Upload tokenizer

Files changed (4)
  1. special_tokens_map.json +7 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +86 -0
  4. vocab.txt +0 -0
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
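
For context, a minimal sketch of how the special tokens declared above surface once the uploaded files are loaded with Hugging Face `transformers`; the local path `./tokenizer` is a placeholder for wherever this repo is cloned or downloaded, not part of the commit:

```python
from transformers import AutoTokenizer

# Load the tokenizer from a local copy of the uploaded files
# ("./tokenizer" is a placeholder path).
tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

# The tokens from special_tokens_map.json are exposed as attributes,
# with ids looked up in vocab.txt.
print(tokenizer.cls_token, tokenizer.sep_token)  # [CLS] [SEP]
print(tokenizer.pad_token_id)

# [CLS] and [SEP] are inserted automatically around a single sequence.
enc = tokenizer("A tokenized sentence.")
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
```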
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,86 @@
+{
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "id2label": {
+    "0": "O",
+    "1": "B-Title",
+    "10": "B-Algorithm",
+    "11": "B-Figure",
+    "12": "B-Table",
+    "13": "B-Caption",
+    "14": "B-Header",
+    "15": "B-Footer",
+    "16": "B-Footnote",
+    "17": "I-Title",
+    "18": "I-Author",
+    "19": "I-Abstract",
+    "2": "B-Author",
+    "20": "I-Keywords",
+    "21": "I-Section",
+    "22": "I-Paragraph",
+    "23": "I-List",
+    "24": "I-Bibliography",
+    "25": "I-Equation",
+    "26": "I-Algorithm",
+    "27": "I-Figure",
+    "28": "I-Table",
+    "29": "I-Caption",
+    "3": "B-Abstract",
+    "30": "I-Header",
+    "31": "I-Footer",
+    "32": "I-Footnote",
+    "4": "B-Keywords",
+    "5": "B-Section",
+    "6": "B-Paragraph",
+    "7": "B-List",
+    "8": "B-Bibliography",
+    "9": "B-Equation"
+  },
+  "label2id": {
+    "B-Abstract": 3,
+    "B-Algorithm": 10,
+    "B-Author": 2,
+    "B-Bibliography": 8,
+    "B-Caption": 13,
+    "B-Equation": 9,
+    "B-Figure": 11,
+    "B-Footer": 15,
+    "B-Footnote": 16,
+    "B-Header": 14,
+    "B-Keywords": 4,
+    "B-List": 7,
+    "B-Paragraph": 6,
+    "B-Section": 5,
+    "B-Table": 12,
+    "B-Title": 1,
+    "I-Abstract": 19,
+    "I-Algorithm": 26,
+    "I-Author": 18,
+    "I-Bibliography": 24,
+    "I-Caption": 29,
+    "I-Equation": 25,
+    "I-Figure": 27,
+    "I-Footer": 31,
+    "I-Footnote": 32,
+    "I-Header": 30,
+    "I-Keywords": 20,
+    "I-List": 23,
+    "I-Paragraph": 22,
+    "I-Section": 21,
+    "I-Table": 28,
+    "I-Title": 17,
+    "O": 0
+  },
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "num_labels": 33,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
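
The `id2label`/`label2id` tables define a BIO tagging scheme over document-layout categories: `B-` opens a span, `I-` continues it, and `O` marks tokens outside any span. A minimal sketch of decoding per-token predictions with this map; the predicted ids below are made up for illustration, and in practice they would come from a token-classification model's argmax output:

```python
import json

# Read the label map from the uploaded config; JSON object keys are strings,
# so integer predictions must be converted with str() before lookup.
with open("tokenizer_config.json") as f:
    config = json.load(f)

id2label = config["id2label"]

predicted_ids = [1, 17, 17, 0, 5, 21]  # hypothetical per-token predictions
labels = [id2label[str(i)] for i in predicted_ids]
print(labels)  # ['B-Title', 'I-Title', 'I-Title', 'O', 'B-Section', 'I-Section']
```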
vocab.txt ADDED
The diff for this file is too large to render. See raw diff