gxy committed
Commit e9eeeea
1 Parent(s): 860c257

ADD: add fast tokenizer

added_tokens.json CHANGED
@@ -1 +1 @@
- {"<pad>": 40000, "<mask>": 40001}
+ {"<mask>": 40001, "<pad>": 40000}
save_tokenizer.py DELETED
@@ -1,9 +0,0 @@
- from transformers import T5Tokenizer
-
- tokenizer = T5Tokenizer.from_pretrained(
-     '/cognitive_comp/common_data/tokenizers/sentence_piece_bpe/bpe_v40000_s42_cov0.9995_max6_corpus1M.model',
-     additional_special_tokens=['<s>', '<mask>'],
-     extra_ids=0)
- tokenizer.bos_token = '<s>'
- tokenizer.mask_token = '<mask>'
- tokenizer.save_pretrained('/cognitive_comp/gaoxinyu/pretrained_model/bart-base')
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
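
The deleted save_tokenizer.py built the slow, SentencePiece-based T5Tokenizer; the tokenizer.json added here is the serialized fast tokenizer. A sketch of how such a file can be produced, not necessarily the exact command used for this commit; both paths are hypothetical:

from transformers import T5TokenizerFast

# Directory holding the slow tokenizer files (spiece.model, added_tokens.json,
# special_tokens_map.json, tokenizer_config.json). Hypothetical path.
slow_dir = "./bart-base-slow"

# Loading a *Fast class from slow-only files triggers transformers' built-in
# slow-to-fast conversion (requires the sentencepiece and tokenizers packages).
fast_tokenizer = T5TokenizerFast.from_pretrained(slow_dir)

# save_pretrained now also writes tokenizer.json next to the other files.
fast_tokenizer.save_pretrained("./Randeng-BART-139M")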
 
tokenizer_config.json CHANGED
@@ -8,6 +8,7 @@
    "<mask>"
  ],
  "sp_model_kwargs": {},
- "name_or_path": "/cognitive_comp/common_data/tokenizers/sentence_piece_bpe/bpe_v40000_s42_cov0.9995_max6_corpus1M.model",
+ "name_or_path": "/cognitive_comp/gaoxinyu/hf_hub/Randeng-BART-139M",
+ "special_tokens_map_file": "/cognitive_comp/gaoxinyu/hf_hub/Randeng-BART-139M/special_tokens_map.json",
  "tokenizer_class": "T5Tokenizer"
  }
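
With tokenizer.json committed, AutoTokenizer should return the fast (Rust-backed) tokenizer by default. A usage sketch; the local path is hypothetical and a hub repo id would work the same way:

from transformers import AutoTokenizer

# Hypothetical local clone of this repository.
tokenizer = AutoTokenizer.from_pretrained("./Randeng-BART-139M", use_fast=True)

print(type(tokenizer).__name__)                       # expected: T5TokenizerFast
print(tokenizer.mask_token, tokenizer.mask_token_id)  # expected: <mask> 40001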