Taizo Kaneko committed on
Commit 97c46f0
1 Parent(s): 3b2e896

commit files to HF hub

.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+vocab.txt filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,27 @@
+{
+  "architectures": [
+    "FastTextForSeuqenceClassification"
+  ],
+  "auto_map": {
+    "AutoConfig": "fasttext_jp_embedding.FastTextJpConfig",
+    "AutoModel": "fasttext_fsc.FastTextForSeuqenceClassification"
+  },
+  "hidden_size": 300,
+  "id2label": {
+    "0": "entailment",
+    "1": "neutral",
+    "2": "contradiction"
+  },
+  "label2id": {
+    "contradiction": 2,
+    "entailment": 0,
+    "neutral": 1
+  },
+  "max_length": 128,
+  "model_type": "fasttext_jp",
+  "tokenizer_class": "FastTextJpTokenizer",
+  "torch_dtype": "float32",
+  "transformers_version": "4.23.1",
+  "vocab_size": 500
+}
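
The auto_map above is what lets the Auto classes resolve the custom code shipped in this repo. A minimal loading sketch, assuming the files are pushed to a Hub repo; "<user>/<repo>" is a placeholder, not the actual repo id:

from transformers import AutoModel, AutoTokenizer

# trust_remote_code is required because AutoConfig/AutoModel point at
# classes defined inside the repo, not inside transformers itself.
tokenizer = AutoTokenizer.from_pretrained("<user>/<repo>", trust_remote_code=True)
model = AutoModel.from_pretrained("<user>/<repo>", trust_remote_code=True)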
fasttext_fsc.py ADDED
@@ -0,0 +1,56 @@
+from __future__ import annotations
+import torch
+from .fasttext_jp_embedding import FastTextJpModel, FastTextJpConfig
+from transformers.modeling_outputs import SequenceClassifierOutput
+
+
+class FastTextForSeuqenceClassification(FastTextJpModel):
+    """Classification on top of FastText sentence vectors.
+    """
+
+    def __init__(self, config: FastTextJpConfig):
+        super().__init__(config)
+
+    def forward(self, **inputs) -> SequenceClassifierOutput:
+        """Scores a sentence against a candidate label.
+
+        Returns:
+            SequenceClassifierOutput: logits over
+                (entailment, neutral, contradiction).
+        """
+        input_ids = inputs["input_ids"]
+        outputs = self.word_embeddings(input_ids)
+        # Split the embedded tokens into the sentence (segment 0) and the
+        # candidate label (segment 1), dropping padding tokens.
+        sentence = outputs[torch.logical_and(inputs["attention_mask"] == 1,
+                                             inputs["token_type_ids"] == 0)]
+        candidate_label = outputs[torch.logical_and(
+            inputs["attention_mask"] == 1, inputs["token_type_ids"] == 1)]
+
+        sentence_mean = torch.mean(sentence, dim=-2, keepdim=True)
+        candidate_label_mean = torch.mean(candidate_label,
+                                          dim=-2,
+                                          keepdim=True)
+        # The cosine similarity p of the two means becomes the entailment
+        # logit log(p); the contradiction logit is log(1 - p).
+        if sentence_mean.dim() == 2:
+            p = torch.nn.functional.cosine_similarity(sentence_mean,
+                                                      candidate_label_mean,
+                                                      dim=1)
+            logits = [[torch.log(p), -torch.inf, torch.log(1 - p)]]
+        else:
+            logits = []
+            # Handle each item of the batch separately.
+            for sm, clm in zip(sentence_mean, candidate_label_mean):
+                p = torch.nn.functional.cosine_similarity(sm, clm, dim=1)
+                logits.append([[torch.log(p), -torch.inf, torch.log(1 - p)]])
+        logits = torch.FloatTensor(logits)
+        return SequenceClassifierOutput(
+            loss=None,
+            logits=logits,  # type: ignore
+            hidden_states=None,
+            attentions=None,
+        )
+
+
+# Registration with AutoModel is required, but the recommended way keeps
+# changing and is not settled. (2022/11/6)
+# https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
+FastTextForSeuqenceClassification.register_for_auto_class("AutoModel")
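
The logit construction above maps the cosine similarity p between the sentence mean and the label mean onto the three NLI classes. A standalone sketch with an illustrative p, showing that softmax over these logits recovers exactly (p, 0, 1 - p):

import torch

p = torch.tensor([0.8])  # assumed cosine similarity of the two mean vectors
logits = torch.stack(
    [torch.log(p), torch.full_like(p, float("-inf")), torch.log(1 - p)],
    dim=1)
probs = torch.softmax(logits, dim=-1)
print(probs)  # tensor([[0.8000, 0.0000, 0.2000]]): entailment, neutral, contradiction

Note that log(p) is NaN whenever p <= 0, so a negative cosine similarity yields a NaN entailment logit.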
fasttext_jp_embedding.py ADDED
@@ -0,0 +1,48 @@
+from __future__ import annotations
+from transformers import PretrainedConfig
+from transformers import PreTrainedModel
+from torch import nn
+import torch
+from torchtyping import TensorType
+
+
+class FastTextJpConfig(PretrainedConfig):
+    """Config for FastTextJpModel.
+    """
+    model_type = "fasttext_jp"
+
+    def __init__(self, tokenizer_class="FastTextJpTokenizer", **kwargs):
+        """Initialization.
+
+        Args:
+            tokenizer_class (str, optional):
+                Without an explicit tokenizer_class, the model cannot be
+                loaded from a pipeline. The value is written to config.json.
+        """
+        kwargs["tokenizer_class"] = tokenizer_class
+        super().__init__(**kwargs)
+
+
+class FastTextJpModel(PreTrainedModel):
+    """Performs the FastText embedding.
+    """
+    config_class = FastTextJpConfig
+
+    def __init__(self, config: FastTextJpConfig):
+        super().__init__(config)
+        self.word_embeddings = nn.Embedding(config.vocab_size,
+                                            config.hidden_size)
+
+    def forward(self, **inputs) -> TensorType["batch", "word", "vectors"]:
+        """Computes the embedding.
+
+        Returns:
+            TensorType["batch", "word", "vectors"]: a vector per word.
+        """
+        # nn.Embedding requires integer indices; torch.Tensor would build a
+        # float tensor, so cast input_ids to long instead.
+        input_ids = torch.as_tensor(inputs["input_ids"], dtype=torch.long)
+        return self.word_embeddings(input_ids)
+
+
+# Registration with AutoModel is required, but the recommended way keeps
+# changing and is not settled. (2022/11/6)
+# https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
+FastTextJpConfig.register_for_auto_class()
+FastTextJpModel.register_for_auto_class("AutoModel")
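
The commit does not show how the embedding weights were filled, but nn.Embedding can be populated from a precomputed fastText matrix. A sketch with random stand-in vectors sized to match vocab_size=500 and hidden_size=300 from config.json:

import torch
from torch import nn

vectors = torch.randn(500, 300)  # stand-in for real fastText vectors
word_embeddings = nn.Embedding.from_pretrained(vectors)

input_ids = torch.tensor([[1, 5, 42]])  # (batch, word)
out = word_embeddings(input_ids)        # shape (1, 3, 300): one vector per word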
fasttext_jp_tokenizer.py ADDED
@@ -0,0 +1,143 @@
+from __future__ import annotations
+from .mecab_tokenizer import MeCabTokenizer
+import os
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+
+def save_stoi(stoi: dict[str, int], vocab_file: str):
+    """Saves the token-to-id mapping to vocab_file as a list of tokens.
+
+    Args:
+        stoi (dict[str, int]): token-to-id mapping
+        vocab_file (str): path to save to
+
+    Raises:
+        ValueError: raised when the ids are not consecutive.
+    """
+
+    with open(vocab_file, "w", encoding="utf-8") as writer:
+        index = 0
+        for token, token_index in sorted(stoi.items(), key=lambda kv: kv[1]):
+            if index != token_index:
+                raise ValueError(
+                    f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                    " Please check that the vocabulary is not corrupted!")
+            writer.write(token + "\n")
+            index += 1
+
+
+def load_stoi(vocab_file: str) -> dict[str, int]:
+    """Loads the token-to-id mapping from a file.
+
+    Args:
+        vocab_file (str): path to the file
+
+    Returns:
+        dict[str, int]: token-to-id mapping
+    """
+
+    stoi: dict[str, int] = {}
+    # Read the tokens from the file.
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+
+    # Build the token-to-id mapping; the line number is the id.
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        stoi[token] = index
+    return stoi
+
+
+class FastTextJpTokenizer(MeCabTokenizer):
+
+    # Required so that the config recognizes this tokenizer.
+    # https://huggingface.co/docs/transformers/custom_models#writing-a-custom-configuration
+    model_type = "fasttext_jp"
+
+    # Most likely required for vocab.txt to be picked up.
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(self,
+                 vocab_file: str,
+                 hinshi: list[str] | None = None,
+                 mecab_dicdir: str | None = None,
+                 **kwargs):
+        """Initialization.
+
+        Args:
+            vocab_file (str): path to the vocab file
+            hinshi (list[str] | None, optional): parts of speech to extract
+            mecab_dicdir (str | None, optional): directory containing dicrc
+        """
+        super().__init__(hinshi, mecab_dicdir, **kwargs)
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'."
+                " To load the vocabulary from a pretrained model, use"
+                " `tokenizer = FastTextJpTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.stoi = load_stoi(vocab_file)
+        self.itos = dict([(ids, tok) for tok, ids in self.stoi.items()])
+
+    @property
+    def vocab_size(self) -> int:
+        """Size of the vocabulary.
+        Required method of PreTrainedTokenizer.
+
+        Returns:
+            int: size of the vocabulary
+        """
+        return len(self.stoi)
+
+    def _convert_token_to_id(self, token: str) -> int:
+        """Token to id.
+        Required method of PreTrainedTokenizer.
+
+        Args:
+            token (str): token
+
+        Returns:
+            int: id
+        """
+        return self.stoi[token]
+
+    def _convert_id_to_token(self, index: int) -> str:
+        """Id to token.
+        Required method of PreTrainedTokenizer.
+
+        Args:
+            index (int): id
+
+        Returns:
+            str: token
+        """
+        return self.itos[index]
+
+    def save_vocabulary(self,
+                        save_directory: str,
+                        filename_prefix: str | None = None) -> tuple[str]:
+        """Saves the vocabulary.
+
+        Args:
+            save_directory (str): directory to save to; the file name is
+                fixed to vocab.txt
+            filename_prefix (str | None, optional): prefix for the file name
+
+        Returns:
+            tuple[str]: the saved file name.
+        """
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory,
+                (filename_prefix + "-" if filename_prefix else "") +
+                VOCAB_FILES_NAMES["vocab_file"])
+        else:
+            vocab_file = (filename_prefix +
+                          "-" if filename_prefix else "") + save_directory
+        save_stoi(self.stoi, vocab_file)
+        return (vocab_file, )
+
+
+# Registration with AutoTokenizer is required, but the recommended way keeps
+# changing and is not settled. (2022/11/6)
+# https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
+FastTextJpTokenizer.register_for_auto_class("AutoTokenizer")
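
A round-trip sketch for the vocab helpers above, assuming the module is importable (it uses a relative import, so it must live inside a package); the tokens and path are illustrative:

from fasttext_jp_tokenizer import save_stoi, load_stoi

stoi = {"こんにちは": 0, "世界": 1, "*": 2}
save_stoi(stoi, "vocab.txt")  # one token per line; the line number is the id
assert load_stoi("vocab.txt") == stoi

# Non-consecutive ids fail fast:
# save_stoi({"a": 0, "b": 2}, "bad.txt")  -> ValueError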
mecab_tokenizer.py ADDED
@@ -0,0 +1,92 @@
+from __future__ import annotations
+from typing import NamedTuple
+import MeCab
+from transformers import PreTrainedTokenizer
+
+
+class MeCabResult(NamedTuple):
+    """Type of one MeCab analysis result.
+    """
+    hyosokei: str  # surface form
+    hinshi: str  # part of speech
+    hinshi_saibunrui_1: str  # part-of-speech subdivision 1
+    hinshi_saibunrui_2: str  # part-of-speech subdivision 2
+    hinshi_saibunrui_3: str  # part-of-speech subdivision 3
+    katsuyokei_1: str  # conjugation type
+    katsuyokei_2: str  # conjugation form
+    genkei: str  # base form
+    yomi: str  # reading
+    hatsuon: str  # pronunciation
+
+
+class MeCabTokenizer(PreTrainedTokenizer):
+
+    def __init__(self,
+                 hinshi: list[str] | None = None,
+                 mecab_dicdir: str | None = None,
+                 **kwargs):
+        """Initialization.
+
+        Args:
+            hinshi (list[str] | None): parts of speech to extract
+            mecab_dicdir (str | None, optional): directory containing dicrc
+        """
+
+        self.target_hinshi = hinshi
+        if mecab_dicdir is not None:
+            self.mecab = MeCab.Tagger(f"-d {mecab_dicdir}")
+        else:
+            self.mecab = MeCab.Tagger()
+
+        super().__init__(**kwargs)
+
+    def _tokenize(self, text: str) -> list[str]:
+        """Returns the words with the specified parts of speech from a text.
+
+        Args:
+            text (str): text
+
+        Returns:
+            list[str]: words with the specified parts of speech
+        """
+
+        out = []
+        # Analyze with MeCab.
+        result_words = self.mecab_analyze(text)
+        for result_word in result_words:
+            # The first and last nodes (BOS/EOS) have an empty surface form.
+            if result_word.hyosokei == "":
+                continue
+            if self.target_hinshi is not None:
+                if result_word.hinshi in self.target_hinshi:
+                    # Return only the specified parts of speech.
+                    out.append(result_word.hyosokei)
+                else:
+                    continue
+            else:
+                out.append(result_word.hyosokei)
+        return out
+
+    def mecab_analyze(self, text: str) -> list[MeCabResult]:
+        """Analyzes a text with MeCab.
+
+        Args:
+            text (str): text
+
+        Returns:
+            list[MeCabResult]: MeCab analysis results
+        """
+        node = self.mecab.parseToNode(text)
+        # Process the morphemes one by one.
+        out = []
+        while node:
+            args = []
+            args.append(node.surface)
+            feature = node.feature.split(",")
+            args.extend(feature)
+            # Unknown words may carry fewer feature fields, so pad the
+            # argument list out to the 10 fields MeCabResult expects.
+            args.extend([""] * (10 - len(args)))
+            mecab_result = MeCabResult(args[0], args[1], args[2], args[3],
+                                       args[4], args[5], args[6], args[7],
+                                       args[8], args[9])
+            out.append(mecab_result)
+            node = node.next  # skips past the final EOS node
+        return out
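
A usage sketch for the part-of-speech filter above, assuming MeCab and a system dictionary are installed; the exact tokens depend on the installed dictionary:

tok = MeCabTokenizer(hinshi=["名詞", "動詞"])  # keep nouns and verbs
print(tok._tokenize("吾輩は猫である"))
# e.g. ['吾輩', '猫'] with an ipadic-style dictionary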
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6bd689ac294c1623aa08045af576207197ec480898bbb9f4057b062f63cfdf4f
+size 600829
special_tokens_map.json ADDED
@@ -0,0 +1,4 @@
+{
+  "pad_token": "*",
+  "unk_token": "*"
+}
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+{
+  "auto_map": {
+    "AutoTokenizer": [
+      "fasttext_jp_tokenizer.FastTextJpTokenizer",
+      null
+    ]
+  },
+  "model_max_length": 128,
+  "pad_token": "*",
+  "tokenizer_class": "FastTextJpTokenizer",
+  "unk_token": "*"
+}
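
Because id2label in config.json uses the entailment/neutral/contradiction labels of an NLI model, the model should slot into the zero-shot-classification pipeline. A hedged sketch ("<user>/<repo>" is again a placeholder, not the actual repo id):

from transformers import pipeline

classifier = pipeline("zero-shot-classification",
                      model="<user>/<repo>",
                      trust_remote_code=True)
classifier("今日は猫の話をします。", candidate_labels=["動物", "政治", "スポーツ"])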
vocab.txt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a1770ed0a47f44e882afc3f56271a16bc8dba675f18dd61e2cffac276b49acc
+size 29910902