guillermoruiz committed on
Commit
6575a02
1 Parent(s): 5767724

Upload tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer.json +57 -21
tokenizer.json CHANGED
@@ -1,21 +1,7 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 280,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
- "padding": {
10
- "strategy": {
11
- "Fixed": 280
12
- },
13
- "direction": "Right",
14
- "pad_to_multiple_of": null,
15
- "pad_id": 0,
16
- "pad_type_id": 0,
17
- "pad_token": "[PAD]"
18
- },
19
  "added_tokens": [
20
  {
21
  "id": 0,
@@ -64,11 +50,61 @@
64
  }
65
  ],
66
  "normalizer": {
67
- "type": "BertNormalizer",
68
- "clean_text": true,
69
- "handle_chinese_chars": true,
70
- "strip_accents": null,
71
- "lowercase": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  },
73
  "pre_tokenizer": {
74
  "type": "BertPreTokenizer"
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
 
50
  }
51
  ],
52
  "normalizer": {
53
+ "type": "Sequence",
54
+ "normalizers": [
55
+ {
56
+ "type": "NFD"
57
+ },
58
+ {
59
+ "type": "Replace",
60
+ "pattern": {
61
+ "Regex": "@\\S+"
62
+ },
63
+ "content": "_USR"
64
+ },
65
+ {
66
+ "type": "Replace",
67
+ "pattern": {
68
+ "Regex": "(http|ftp|https)://\\S+"
69
+ },
70
+ "content": "_URL"
71
+ },
72
+ {
73
+ "type": "Replace",
74
+ "pattern": {
75
+ "Regex": " j(a|e|i)[jaei]+"
76
+ },
77
+ "content": " jaja"
78
+ },
79
+ {
80
+ "type": "Replace",
81
+ "pattern": {
82
+ "Regex": " h(a|e|i)[haei]+"
83
+ },
84
+ "content": " jaja"
85
+ },
86
+ {
87
+ "type": "Replace",
88
+ "pattern": {
89
+ "String": "&"
90
+ },
91
+ "content": "&"
92
+ },
93
+ {
94
+ "type": "Replace",
95
+ "pattern": {
96
+ "String": ">"
97
+ },
98
+ "content": ">"
99
+ },
100
+ {
101
+ "type": "Replace",
102
+ "pattern": {
103
+ "String": "<"
104
+ },
105
+ "content": "<"
106
+ }
107
+ ]
108
  },
109
  "pre_tokenizer": {
110
  "type": "BertPreTokenizer"