Kaspar Beelen committed on
Commit
cd57bf1
1 Parent(s): ad792cc

add tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[1801]": 30579,
3
+ "[1802]": 30591,
4
+ "[1803]": 30577,
5
+ "[1804]": 30589,
6
+ "[1805]": 30596,
7
+ "[1806]": 30547,
8
+ "[1807]": 30532,
9
+ "[1808]": 30555,
10
+ "[1809]": 30530,
11
+ "[1810]": 30593,
12
+ "[1811]": 30584,
13
+ "[1812]": 30581,
14
+ "[1813]": 30553,
15
+ "[1814]": 30573,
16
+ "[1815]": 30536,
17
+ "[1816]": 30568,
18
+ "[1817]": 30587,
19
+ "[1818]": 30570,
20
+ "[1819]": 30586,
21
+ "[1820]": 30578,
22
+ "[1821]": 30597,
23
+ "[1822]": 30557,
24
+ "[1823]": 30561,
25
+ "[1824]": 30566,
26
+ "[1825]": 30569,
27
+ "[1826]": 30595,
28
+ "[1827]": 30580,
29
+ "[1828]": 30594,
30
+ "[1829]": 30582,
31
+ "[1830]": 30583,
32
+ "[1831]": 30534,
33
+ "[1832]": 30588,
34
+ "[1833]": 30590,
35
+ "[1834]": 30539,
36
+ "[1835]": 30565,
37
+ "[1836]": 30567,
38
+ "[1837]": 30549,
39
+ "[1838]": 30585,
40
+ "[1839]": 30592,
41
+ "[1840]": 30562,
42
+ "[1841]": 30541,
43
+ "[1842]": 30575,
44
+ "[1843]": 30598,
45
+ "[1844]": 30552,
46
+ "[1845]": 30554,
47
+ "[1846]": 30544,
48
+ "[1847]": 30558,
49
+ "[1848]": 30533,
50
+ "[1849]": 30531,
51
+ "[1850]": 30543,
52
+ "[1851]": 30559,
53
+ "[1852]": 30550,
54
+ "[1853]": 30551,
55
+ "[1854]": 30556,
56
+ "[1855]": 30542,
57
+ "[1856]": 30548,
58
+ "[1857]": 30563,
59
+ "[1858]": 30571,
60
+ "[1859]": 30529,
61
+ "[1860]": 30564,
62
+ "[1861]": 30538,
63
+ "[1862]": 30537,
64
+ "[1863]": 30546,
65
+ "[1864]": 30572,
66
+ "[1865]": 30535,
67
+ "[1866]": 30545,
68
+ "[1867]": 30560,
69
+ "[1868]": 30540,
70
+ "[1869]": 30576,
71
+ "[1870]": 30574,
72
+ "[1871]": 30599,
73
+ "[LOC]": 30603,
74
+ "[MET]": 30600,
75
+ "[POL]": 30602,
76
+ "[YEAR]": 30601,
77
+ "[con]": 30523,
78
+ "[lib]": 30522,
79
+ "[liverpool]": 30528,
80
+ "[london]": 30527,
81
+ "[neutr]": 30526,
82
+ "[none]": 30524,
83
+ "[rad]": 30525
84
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "[MET]",
4
+ "[YEAR]",
5
+ "[POL]",
6
+ "[LOC]"
7
+ ],
8
+ "cls_token": "[CLS]",
9
+ "mask_token": "[MASK]",
10
+ "pad_token": "[PAD]",
11
+ "sep_token": "[SEP]",
12
+ "unk_token": "[UNK]"
13
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_lower_case": true,
4
+ "mask_token": "[MASK]",
5
+ "model_max_length": 512,
6
+ "name_or_path": "erwt-year-st",
7
+ "pad_token": "[PAD]",
8
+ "sep_token": "[SEP]",
9
+ "special_tokens_map_file": null,
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "DistilBertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff