akiFQC committed on
Commit
63782c6
1 Parent(s): fc0a8a8

update tokenizer

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- pipeline_tag: feature-extraction
3
  language: ja
4
  license: cc-by-sa-4.0
5
  tags:
1
  ---
2
+ pipeline_tag: sentence-similarity
3
  language: ja
4
  license: cc-by-sa-4.0
5
  tags:
README_JA.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- pipeline_tag: feature-extraction
3
  language: ja
4
  license: cc-by-sa-4.0
5
  tags:
1
  ---
2
+ pipeline_tag: sentence-similarity
3
  language: ja
4
  license: cc-by-sa-4.0
5
  tags:
config.json CHANGED
@@ -1,10 +1,8 @@
1
  {
2
- "_name_or_path": "cl-tohoku/bert-base-japanese-v2",
3
  "architectures": [
4
  "BertModel"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
- "classifier_dropout": null,
8
  "hidden_act": "gelu",
9
  "hidden_dropout_prob": 0.1,
10
  "hidden_size": 768,
@@ -16,11 +14,7 @@
16
  "num_attention_heads": 12,
17
  "num_hidden_layers": 12,
18
  "pad_token_id": 0,
19
- "position_embedding_type": "absolute",
20
  "tokenizer_class": "BertJapaneseTokenizer",
21
- "torch_dtype": "float32",
22
- "transformers_version": "4.25.1",
23
  "type_vocab_size": 2,
24
- "use_cache": true,
25
  "vocab_size": 32768
26
  }
1
  {
 
2
  "architectures": [
3
  "BertModel"
4
  ],
5
  "attention_probs_dropout_prob": 0.1,
 
6
  "hidden_act": "gelu",
7
  "hidden_dropout_prob": 0.1,
8
  "hidden_size": 768,
14
  "num_attention_heads": 12,
15
  "num_hidden_layers": 12,
16
  "pad_token_id": 0,
 
17
  "tokenizer_class": "BertJapaneseTokenizer",
 
 
18
  "type_vocab_size": 2,
 
19
  "vocab_size": 32768
20
  }
requirements.txt DELETED
@@ -1 +0,0 @@
1
- fugashi[unidic-lite]
 
special_tokens_map.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "cls_token": "[CLS]",
3
- "mask_token": "[MASK]",
4
- "pad_token": "[PAD]",
5
- "sep_token": "[SEP]",
6
- "unk_token": "[UNK]"
7
- }
 
 
 
 
 
 
 
tokenizer_config.json CHANGED
@@ -2,19 +2,17 @@
2
  "do_lower_case": false,
3
  "do_subword_tokenize": true,
4
  "do_word_tokenize": true,
5
- "jumanpp_kwargs": null,
6
  "mask_token": "[MASK]",
7
  "mecab_kwargs": {
8
  "mecab_dic": "unidic_lite"
9
  },
10
  "model_max_length": 512,
11
- "name_or_path": "cl-tohoku/bert-base-japanese-v2",
12
- "never_split": null,
13
  "pad_token": "[PAD]",
 
14
  "sep_token": "[SEP]",
15
  "special_tokens_map_file": null,
 
16
  "subword_tokenizer_type": "wordpiece",
17
- "mecab_kwargs": {
18
- "mecab_dic": "unidic_lite"
19
- }
20
  }
2
  "do_lower_case": false,
3
  "do_subword_tokenize": true,
4
  "do_word_tokenize": true,
 
5
  "mask_token": "[MASK]",
6
  "mecab_kwargs": {
7
  "mecab_dic": "unidic_lite"
8
  },
9
  "model_max_length": 512,
 
 
10
  "pad_token": "[PAD]",
11
+ "cls_token": "[CLS]",
12
  "sep_token": "[SEP]",
13
  "special_tokens_map_file": null,
14
+ "name_or_path": "cl-tohoku/bert-base-japanese-v2",
15
  "subword_tokenizer_type": "wordpiece",
16
+ "unk_token": "[UNK]",
17
+ "word_tokenizer_type": "mecab"
 
18
  }