TeeA committed on
Commit
036031e
1 Parent(s): 0e2f6b6
added_tokens.json CHANGED
@@ -1,13 +1,11 @@
1
  {
2
- "</s_cols>": 57528,
3
- "</s_rows>": 57526,
4
- "</s_text>": 57530,
5
- "</s_vichart>": 57532,
6
- "<s_cols>": 57527,
7
- "<s_iitcdip>": 57523,
8
- "<s_rows>": 57525,
9
- "<s_synthdog>": 57524,
10
- "<s_text>": 57529,
11
- "<s_vichart>": 57531,
12
- "<sep/>": 57522
13
  }
 
1
  {
2
+ "</s_cols>": 40033,
3
+ "</s_rows>": 40031,
4
+ "</s_text>": 40035,
5
+ "</s_vichart>": 40037,
6
+ "<s_cols>": 40032,
7
+ "<s_rows>": 40030,
8
+ "<s_text>": 40034,
9
+ "<s_vichart>": 40036,
10
+ "<sep/>": 40038
 
 
11
  }
sentencepiece.bpe.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb9e3dce4c326195d08fc3dd0f7e2eee1da8595c847bf4c1a9c78b7a82d47e2d
3
- size 1296245
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json CHANGED
@@ -1,29 +1,7 @@
1
  {
2
- "additional_special_tokens": [
3
- "<s_iitcdip>",
4
- "<s_synthdog>"
5
- ],
6
- "bos_token": {
7
- "content": "<s>",
8
- "lstrip": false,
9
- "normalized": false,
10
- "rstrip": false,
11
- "single_word": false
12
- },
13
- "cls_token": {
14
- "content": "<s>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false
19
- },
20
- "eos_token": {
21
- "content": "</s>",
22
- "lstrip": false,
23
- "normalized": false,
24
- "rstrip": false,
25
- "single_word": false
26
- },
27
  "mask_token": {
28
  "content": "<mask>",
29
  "lstrip": true,
@@ -31,25 +9,7 @@
31
  "rstrip": false,
32
  "single_word": false
33
  },
34
- "pad_token": {
35
- "content": "<pad>",
36
- "lstrip": false,
37
- "normalized": false,
38
- "rstrip": false,
39
- "single_word": false
40
- },
41
- "sep_token": {
42
- "content": "</s>",
43
- "lstrip": false,
44
- "normalized": false,
45
- "rstrip": false,
46
- "single_word": false
47
- },
48
- "unk_token": {
49
- "content": "<unk>",
50
- "lstrip": false,
51
- "normalized": false,
52
- "rstrip": false,
53
- "single_word": false
54
- }
55
  }
 
1
  {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "mask_token": {
6
  "content": "<mask>",
7
  "lstrip": true,
 
9
  "rstrip": false,
10
  "single_word": false
11
  },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  }
tokenizer_config.json CHANGED
@@ -32,7 +32,7 @@
32
  "single_word": false,
33
  "special": true
34
  },
35
- "57521": {
36
  "content": "<mask>",
37
  "lstrip": true,
38
  "normalized": true,
@@ -40,31 +40,7 @@
40
  "single_word": false,
41
  "special": true
42
  },
43
- "57522": {
44
- "content": "<sep/>",
45
- "lstrip": false,
46
- "normalized": true,
47
- "rstrip": false,
48
- "single_word": false,
49
- "special": false
50
- },
51
- "57523": {
52
- "content": "<s_iitcdip>",
53
- "lstrip": false,
54
- "normalized": false,
55
- "rstrip": false,
56
- "single_word": false,
57
- "special": true
58
- },
59
- "57524": {
60
- "content": "<s_synthdog>",
61
- "lstrip": false,
62
- "normalized": false,
63
- "rstrip": false,
64
- "single_word": false,
65
- "special": true
66
- },
67
- "57525": {
68
  "content": "<s_rows>",
69
  "lstrip": false,
70
  "normalized": true,
@@ -72,7 +48,7 @@
72
  "single_word": false,
73
  "special": false
74
  },
75
- "57526": {
76
  "content": "</s_rows>",
77
  "lstrip": false,
78
  "normalized": true,
@@ -80,7 +56,7 @@
80
  "single_word": false,
81
  "special": false
82
  },
83
- "57527": {
84
  "content": "<s_cols>",
85
  "lstrip": false,
86
  "normalized": true,
@@ -88,7 +64,7 @@
88
  "single_word": false,
89
  "special": false
90
  },
91
- "57528": {
92
  "content": "</s_cols>",
93
  "lstrip": false,
94
  "normalized": true,
@@ -96,7 +72,7 @@
96
  "single_word": false,
97
  "special": false
98
  },
99
- "57529": {
100
  "content": "<s_text>",
101
  "lstrip": false,
102
  "normalized": true,
@@ -104,7 +80,7 @@
104
  "single_word": false,
105
  "special": false
106
  },
107
- "57530": {
108
  "content": "</s_text>",
109
  "lstrip": false,
110
  "normalized": true,
@@ -112,7 +88,7 @@
112
  "single_word": false,
113
  "special": false
114
  },
115
- "57531": {
116
  "content": "<s_vichart>",
117
  "lstrip": false,
118
  "normalized": true,
@@ -120,19 +96,23 @@
120
  "single_word": false,
121
  "special": false
122
  },
123
- "57532": {
124
  "content": "</s_vichart>",
125
  "lstrip": false,
126
  "normalized": true,
127
  "rstrip": false,
128
  "single_word": false,
129
  "special": false
 
 
 
 
 
 
 
 
130
  }
131
  },
132
- "additional_special_tokens": [
133
- "<s_iitcdip>",
134
- "<s_synthdog>"
135
- ],
136
  "bos_token": "<s>",
137
  "clean_up_tokenization_spaces": true,
138
  "cls_token": "<s>",
@@ -143,6 +123,6 @@
143
  "processor_class": "DonutProcessor",
144
  "sep_token": "</s>",
145
  "sp_model_kwargs": {},
146
- "tokenizer_class": "XLMRobertaTokenizer",
147
  "unk_token": "<unk>"
148
  }
 
32
  "single_word": false,
33
  "special": true
34
  },
35
+ "40029": {
36
  "content": "<mask>",
37
  "lstrip": true,
38
  "normalized": true,
 
40
  "single_word": false,
41
  "special": true
42
  },
43
+ "40030": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  "content": "<s_rows>",
45
  "lstrip": false,
46
  "normalized": true,
 
48
  "single_word": false,
49
  "special": false
50
  },
51
+ "40031": {
52
  "content": "</s_rows>",
53
  "lstrip": false,
54
  "normalized": true,
 
56
  "single_word": false,
57
  "special": false
58
  },
59
+ "40032": {
60
  "content": "<s_cols>",
61
  "lstrip": false,
62
  "normalized": true,
 
64
  "single_word": false,
65
  "special": false
66
  },
67
+ "40033": {
68
  "content": "</s_cols>",
69
  "lstrip": false,
70
  "normalized": true,
 
72
  "single_word": false,
73
  "special": false
74
  },
75
+ "40034": {
76
  "content": "<s_text>",
77
  "lstrip": false,
78
  "normalized": true,
 
80
  "single_word": false,
81
  "special": false
82
  },
83
+ "40035": {
84
  "content": "</s_text>",
85
  "lstrip": false,
86
  "normalized": true,
 
88
  "single_word": false,
89
  "special": false
90
  },
91
+ "40036": {
92
  "content": "<s_vichart>",
93
  "lstrip": false,
94
  "normalized": true,
 
96
  "single_word": false,
97
  "special": false
98
  },
99
+ "40037": {
100
  "content": "</s_vichart>",
101
  "lstrip": false,
102
  "normalized": true,
103
  "rstrip": false,
104
  "single_word": false,
105
  "special": false
106
+ },
107
+ "40038": {
108
+ "content": "<sep/>",
109
+ "lstrip": false,
110
+ "normalized": true,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
  }
115
  },
 
 
 
 
116
  "bos_token": "<s>",
117
  "clean_up_tokenization_spaces": true,
118
  "cls_token": "<s>",
 
123
  "processor_class": "DonutProcessor",
124
  "sep_token": "</s>",
125
  "sp_model_kwargs": {},
126
+ "tokenizer_class": "BartphoTokenizer",
127
  "unk_token": "<unk>"
128
  }