ptdat committed
Commit 92a1809 · verified · parent 2a311ca

Upload tokenizer

Files changed (2):
  1. tokenization_vnsabsa.py  +7 -2
  2. tokenizer_config.json    +6 -0
tokenization_vnsabsa.py CHANGED
@@ -8,12 +8,17 @@ import regex as re
 from typing import Tuple, Optional
 import shutil
 import os
+import requests
 
 class VnSmartphoneAbsaTokenizer(PreTrainedTokenizer):
+    pretrained_vocab_files_map = {
+        "vocab_file": "https://huggingface.co/ptdat/vn-smartphone-absa/resolve/main/vocab.txt",
+        "merge_file": "https://huggingface.co/ptdat/vn-smartphone-absa/resolve/main/merge.txt"
+    }
     def __init__(
         self,
-        vocab_file="vocab.txt",
-        merge_file="merge.txt",
+        vocab_file,
+        merge_file,
         bos_token="<s>",
         eos_token="</s>",
         sep_token="</s>",
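
The new pretrained_vocab_files_map attribute advertises where the tokenizer's vocabulary and merge files live on the Hub, and dropping the "vocab.txt"/"merge.txt" defaults makes both paths required at construction time. A minimal usage sketch, assuming the repo layout implied by the URLs above and a local copy of tokenization_vnsabsa.py (the hf_hub_download calls are illustrative, not part of this commit):

from huggingface_hub import hf_hub_download
from tokenization_vnsabsa import VnSmartphoneAbsaTokenizer

# Fetch (and cache) the two vocabulary files referenced in
# pretrained_vocab_files_map; hf_hub_download returns the local cached path.
vocab_path = hf_hub_download(repo_id="ptdat/vn-smartphone-absa", filename="vocab.txt")
merge_path = hf_hub_download(repo_id="ptdat/vn-smartphone-absa", filename="merge.txt")

# Both arguments are now required, since the local-filename defaults were removed.
tokenizer = VnSmartphoneAbsaTokenizer(vocab_file=vocab_path, merge_file=merge_path)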
tokenizer_config.json CHANGED
@@ -41,6 +41,12 @@
       "special": true
     }
   },
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_vnsabsa.VnSmartphoneAbsaTokenizer",
+      null
+    ]
+  },
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
   "cls_token": "<s>",