Upload tokenizer
Browse files
- tokenization_vnsabsa.py +7 -2
- tokenizer_config.json +6 -0
tokenization_vnsabsa.py
CHANGED
@@ -8,12 +8,17 @@ import regex as re
|
|
8 |
from typing import Tuple, Optional
|
9 |
import shutil
|
10 |
import os
|
|
|
11 |
|
12 |
class VnSmartphoneAbsaTokenizer(PreTrainedTokenizer):
|
|
|
|
|
|
|
|
|
13 |
def __init__(
|
14 |
self,
|
15 |
-
vocab_file
|
16 |
-
merge_file
|
17 |
bos_token="<s>",
|
18 |
eos_token="</s>",
|
19 |
sep_token="</s>",
|
|
|
8 |
from typing import Tuple, Optional
|
9 |
import shutil
|
10 |
import os
|
11 |
+
import requests
|
12 |
|
13 |
class VnSmartphoneAbsaTokenizer(PreTrainedTokenizer):
|
14 |
+
pretrained_vocab_files_map = {
|
15 |
+
"vocab_file": "https://huggingface.co/ptdat/vn-smartphone-absa/resolve/main/vocab.txt",
|
16 |
+
"merge_file": "https://huggingface.co/ptdat/vn-smartphone-absa/resolve/main/merge.txt"
|
17 |
+
}
|
18 |
def __init__(
|
19 |
self,
|
20 |
+
vocab_file,
|
21 |
+
merge_file,
|
22 |
bos_token="<s>",
|
23 |
eos_token="</s>",
|
24 |
sep_token="</s>",
|
tokenizer_config.json
CHANGED
@@ -41,6 +41,12 @@
|
|
41 |
"special": true
|
42 |
}
|
43 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
"bos_token": "<s>",
|
45 |
"clean_up_tokenization_spaces": true,
|
46 |
"cls_token": "<s>",
|
|
|
41 |
"special": true
|
42 |
}
|
43 |
},
|
44 |
+
"auto_map": {
|
45 |
+
"AutoTokenizer": [
|
46 |
+
"tokenization_vnsabsa.VnSmartphoneAbsaTokenizer",
|
47 |
+
null
|
48 |
+
]
|
49 |
+
},
|
50 |
"bos_token": "<s>",
|
51 |
"clean_up_tokenization_spaces": true,
|
52 |
"cls_token": "<s>",
|