beomi committed
Commit f000713
1 Parent(s): e47e807

Add Support for AutoTokenizer and Pipeline

Files changed (3)
  1. README.md +7 -1
  2. config.json +2 -2
  3. tokenizer_config.json +3 -2
README.md CHANGED
@@ -4,6 +4,12 @@ language: ko
 
 # Bert base model for Korean
 
+## Update
+
+- Update at 2021.11.17 : Add Native Support for BERT Tokenizer (works with AutoTokenizer, pipeline)
+
+---
+
 * 70GB Korean text dataset and 42000 lower-cased subwords are used
 * Check the model performance and other language models for Korean in [github](https://github.com/kiyoungkim1/LM-kor)
 
@@ -13,4 +19,4 @@ tokenizer_gpt3 = BertTokenizerFast.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")
 input_ids = tokenizer_gpt3.encode("text to tokenize")[1:] # remove cls token
 
 model_gpt3 = GPT2LMHeadModel.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")
-```
+```
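The README update above advertises native `AutoTokenizer` and `pipeline` support. A minimal sketch of that usage, assuming the `transformers` library and the `kykim/gpt3-kor-small_based_on_gpt2` repo id already shown in the README (not verified against this exact revision):

```python
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

repo_id = "kykim/gpt3-kor-small_based_on_gpt2"  # repo id taken from the README

# AutoTokenizer can now resolve the tokenizer class from the hub config,
# so BertTokenizerFast no longer has to be imported explicitly.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

# The text-generation pipeline accepts the loaded model and tokenizer directly.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator("안녕하세요", max_length=32))  # "Hello" in Korean, used as a toy prompt
```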
config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "activation_function": "gelu_new",
   "architectures": [
-    "GPT2Model"
+    "GPT2LMHeadModel"
   ],
   "attn_pdrop": 0.1,
   "bos_token_id": 3,
@@ -25,4 +25,4 @@
   "summary_use_proj": true,
   "use_cache": true,
   "vocab_size": 42000
-}
+}
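The config.json change swaps the declared architecture from the bare `GPT2Model` encoder to `GPT2LMHeadModel`, the class the README example actually loads. A hedged sketch of how the field surfaces at load time; the printed value is the expected result after this commit, not verified:

```python
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "kykim/gpt3-kor-small_based_on_gpt2"  # repo id taken from the README

# The architectures field is read straight from config.json; tooling that
# inspects the config (e.g. hub widgets, pipeline task inference) can consult it.
config = AutoConfig.from_pretrained(repo_id)
print(config.architectures)  # expected after this commit: ['GPT2LMHeadModel']

# Loading through the LM-head Auto class now matches the declared architecture.
model = AutoModelForCausalLM.from_pretrained(repo_id)
```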
tokenizer_config.json CHANGED
@@ -1,5 +1,6 @@
 {
   "do_lower_case": true,
   "strip_accents": false,
-  "model_max_length": 2048
-}
+  "model_max_length": 2048,
+  "tokenizer_class": "BertTokenizer"
+}
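The new `"tokenizer_class": "BertTokenizer"` entry is what tells `AutoTokenizer` which class to build for a GPT-2-style checkpoint that ships a BERT-style vocabulary. A small before/after sketch; the printed class name is an expectation, not verified against this revision:

```python
from transformers import AutoTokenizer, BertTokenizerFast

repo_id = "kykim/gpt3-kor-small_based_on_gpt2"  # repo id taken from the README

# Before this commit the class had to be spelled out in code, as in the README:
tokenizer_explicit = BertTokenizerFast.from_pretrained(repo_id)

# After this commit AutoTokenizer reads tokenizer_class from tokenizer_config.json
# and resolves the same tokenizer on its own.
tokenizer_auto = AutoTokenizer.from_pretrained(repo_id)
print(type(tokenizer_auto).__name__)  # expected: BertTokenizerFast

# As in the README, the leading [CLS] token is dropped before feeding the model.
input_ids = tokenizer_auto.encode("text to tokenize")[1:]
```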