paulhindemith
/

fasttext-classification

@@ -3,7 +3,7 @@
     "FastTextForSeuqenceClassification"
   ],
   "auto_map": {
-    "AutoConfig": "fasttext_jp_embedding.FastTextJpConfig",
     "AutoModelForSequenceClassification": "fasttext_fsc.FastTextForSeuqenceClassification"
   },
   "hidden_size": 300,
@@ -19,9 +19,10 @@
   },
   "max_length": 128,
   "model_type": "fasttext_jp",
   "tokenizerI_class": "FastTextJpTokenizer",
   "tokenizer_class": "FastTextJpTokenizer",
   "torch_dtype": "float32",
   "transformers_version": "4.23.1",
-  "vocab_size": 500
 }

     "FastTextForSeuqenceClassification"
   ],
   "auto_map": {
+    "AutoConfig": "fasttext_fsc.FastTextForSeuqenceClassificationConfig",
     "AutoModelForSequenceClassification": "fasttext_fsc.FastTextForSeuqenceClassification"
   },
   "hidden_size": 300,
   },
   "max_length": 128,
   "model_type": "fasttext_jp",
+  "ngram": 2,
   "tokenizerI_class": "FastTextJpTokenizer",
   "tokenizer_class": "FastTextJpTokenizer",
   "torch_dtype": "float32",
   "transformers_version": "4.23.1",
+  "vocab_size": 2000000
 }

fasttext_fsc.py CHANGED Viewed

@@ -7,51 +7,96 @@ from .fasttext_jp_embedding import FastTextJpModel, FastTextJpConfig
 from transformers.modeling_outputs import SequenceClassifierOutput
 class FastTextForSeuqenceClassification(FastTextJpModel):
     """Performs classification based on FastText vectors.
     """
-    def __init__(self, config: FastTextJpConfig):
         super().__init__(config)
     def forward(self, **inputs) -> SequenceClassifierOutput:
-        """Score the sentence tokens against the candidate-label tokens.
         Returns:
-            SequenceClassifierOutput: logits of the form [log(p), -inf, log(1 - p)], p being the cosine similarity of the mean sentence and mean label vectors.
         """
         input_ids = inputs["input_ids"]
         outputs = self.word_embeddings(input_ids)
-        sentence = outputs[torch.logical_and(inputs["attention_mask"] == 1,
-                                             inputs["token_type_ids"] == 0)]
-        candidate_label = outputs[torch.logical_and(
-            inputs["attention_mask"] == 1, inputs["token_type_ids"] == 1)]
-        sentence_mean = torch.mean(sentence, dim=-2, keepdim=True)
-        candidate_label_mean = torch.mean(candidate_label,
-                                          dim=-2,
-                                          keepdim=True)
-        if sentence_mean.dim() == 2:
-            p = torch.nn.functional.cosine_similarity(sentence_mean,
-                                                      candidate_label_mean,
-                                                      dim=1)
-            logits = [[torch.log(p), -torch.inf, torch.log(1 - p)]]
-        else:
-            logits = []
-            # batched means: score each (sentence, label) pair separately
-            for sm, clm in zip(sentence_mean, candidate_label_mean):
-                p = torch.nn.functional.cosine_similarity(sm, clm, dim=1)
-                logits.append([[torch.log(p), -torch.inf, torch.log(1 - p)]])
         logits = torch.FloatTensor(logits)
         return SequenceClassifierOutput(
             loss=None,
-            logits=logits,  # type: ignore
             hidden_states=None,
             attentions=None,
         )
 # Registration with AutoModel is required, but the recommended way of doing it keeps changing and has not settled. (2022/11/6)
 # https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
 FastTextForSeuqenceClassification.register_for_auto_class(
     "AutoModelForSequenceClassification")

 from transformers.modeling_outputs import SequenceClassifierOutput
+class FastTextForSeuqenceClassificationConfig(FastTextJpConfig):
+    """Configuration for FastTextForSeuqenceClassification.
+
+    Adds the ``ngram`` setting on top of FastTextJpConfig.
+    """
+    model_type = "fasttext_jp"
+    def __init__(self,
+                 ngram: int = 2,
+                 tokenizer_class="FastTextJpTokenizer",
+                 **kwargs):
+        """Initialize the configuration.
+
+        Args:
+            ngram (int, optional):
+                N-gram size used when splitting the sentence.
+            tokenizer_class (str, optional):
+                Tokenizer class name. Per the original author's note, the
+                tokenizer is not loaded from a pipeline unless this is
+                set; the value is written to config.json.
+            **kwargs:
+                Remaining keyword arguments, passed on to FastTextJpConfig.
+        """
+        self.ngram = ngram
+        # tokenizer_class always reaches the parent constructor via kwargs.
+        kwargs["tokenizer_class"] = tokenizer_class
+        super().__init__(**kwargs)
 class FastTextForSeuqenceClassification(FastTextJpModel):
     """FastTextのベクトルをベースとした分類を行います。
     """
+    def __init__(self, config: FastTextForSeuqenceClassificationConfig):
+        self.ngram = config.ngram
         super().__init__(config)
     def forward(self, **inputs) -> SequenceClassifierOutput:
+        """候補となるラベルから分類を行います。
         Returns:
+            SequenceClassifierOutput: 候補が正解している確率
         """
         input_ids = inputs["input_ids"]
         outputs = self.word_embeddings(input_ids)
+        logits = []
+        for idx in range(len(outputs)):
+            output = outputs[idx]
+            # token_type_ids == 0が文章、1がラベルです。
+            token_type_ids = inputs["token_type_ids"][idx]
+            # attention_mask == 1がパディングでないもの
+            attention_mask = inputs["attention_mask"][idx]
+            sentence = output[torch.logical_and(token_type_ids == 0,
+                                                attention_mask == 1)]
+            candidate_label = output[torch.logical_and(token_type_ids == 1,
+                                                       attention_mask == 1)]
+            sentence_words = self.split_ngram(sentence, self.ngram)
+            candidate_label_mean = torch.mean(candidate_label,
+                                              dim=-2,
+                                              keepdim=True)
+            p = self.cosine_similarity(sentence_words, candidate_label_mean)
+            logits.append([torch.log(p), -torch.inf, torch.log(1 - p)])
         logits = torch.FloatTensor(logits)
         return SequenceClassifierOutput(
             loss=None,
+            logits=logits,
             hidden_states=None,
             attentions=None,
         )
+    def cosine_similarity(
+            self, sentence_words: TensorType["words", "vectors"],
+            candidate_label_means: TensorType[1, "vectors"]) -> TensorType[1]:
+        res = torch.tensor(0.)
+        for sw in sentence_words:
+            p = torch.nn.functional.cosine_similarity(sw,
+                                                      candidate_label_means[0],
+                                                      dim=0)
+            if p > res:
+                res = p
+        return res
+    def split_ngram(self, sentences: TensorType["word", "vectors"],
+                    n: int) -> TensorType["word", "vectors"]:
+        res = []
+        for i in range(len(sentences) - n + 1):
+            ngram = sentences[i:i + n]
+            res.append(torch.mean(ngram, dim=0, keepdim=False))
+        return torch.stack(res)
 # Registration with AutoModel is required, but the recommended way of doing it keeps changing and has not settled. (2022/11/6)
 # https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
+FastTextForSeuqenceClassificationConfig.register_for_auto_class()
 FastTextForSeuqenceClassification.register_for_auto_class(
     "AutoModelForSequenceClassification")

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0a56d3c08a7e47dbe704cf7eb322380ae83c6b8d2becea81b4ae03da9486c026
-size 600829

 version https://git-lfs.github.com/spec/v1
+oid sha256:ba58a6e9bba7142a3d3507fc094345ae2e5ebb222fe98cdf5b2146487895314e
+size 2400000829