Commit 46f1d8c ("commit files to HF hub") by paulhindemith
Parent: cd708ec

Files changed:
- config.json (+3, -1)
- fasttext_fsc.py (+186, -46)
config.json CHANGED

@@ -19,7 +19,9 @@
   },
   "max_length": 128,
   "model_type": "fasttext_classification",
-  "ngram": 2,
+  "ngrams": [
+    2
+  ],
   "tokenizerI_class": "FastTextJpTokenizer",
   "tokenizer_class": "FastTextJpTokenizer",
   "torch_dtype": "float32",
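The scalar `ngram` key becomes a list-valued `ngrams` key, so one checkpoint can score several n-gram sizes. As a minimal sketch (mine, not part of the commit), the normalization that produces this field mirrors the updated `__init__` in fasttext_fsc.py below:

```python
# Sketch: how the updated config normalizes `ngram` into the `ngrams`
# list that lands in config.json (mirrors the __init__ shown below).
def normalize_ngram(ngram):
    if isinstance(ngram, int):
        return [ngram]          # serialized as "ngrams": [2]
    if isinstance(ngram, list):
        return ngram            # e.g. "ngrams": [1, 2, 3]
    raise TypeError(f"got unknown type {type(ngram)}")

assert normalize_ngram(2) == [2]
assert normalize_ngram([1, 2, 3]) == [1, 2, 3]
```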
fasttext_fsc.py CHANGED

Before (old file, lines 11-101; removed lines are marked with -):

@@ -11,82 +11,92 @@ class FastTextForSeuqenceClassificationConfig(FastTextJpConfig):
     model_type = "fasttext_classification"

     def __init__(self,
-                 ngram: int = 2,
                  tokenizer_class="FastTextJpTokenizer",
                  **kwargs):
         """Initialization.

         Args:
-            ngram (int, optional):
-                N-gram size used when splitting a sentence.
             tokenizer_class (str, optional):
                 Unless tokenizer_class is specified, the model cannot be
                 loaded from a pipeline. It is recorded in config.json.
         """
-        self.ngram = ngram
         kwargs["tokenizer_class"] = tokenizer_class
         super().__init__(**kwargs)


-class FastTextForSeuqenceClassification(FastTextJpModel):
-    """Performs classification based on fastText vectors.
-    """

-    def __init__(self, config: FastTextForSeuqenceClassificationConfig):

-        self.max_ngram = config.ngram
-        super().__init__(config)

-    def forward(self, **inputs) -> SequenceClassifierOutput:
-        """Classifies against the candidate labels.

         Returns:
-            SequenceClassifierOutput: probability that the candidate is correct
         """
-        input_ids = inputs["input_ids"]
-        outputs = self.word_embeddings(input_ids)

-        logits = []
-        for idx in range(len(outputs)):
-            output = outputs[idx]
-            # token_type_ids == 0 is the sentence, 1 is the label.
-            token_type_ids = inputs["token_type_ids"][idx]
-            # attention_mask == 1 marks tokens that are not padding.
-            attention_mask = inputs["attention_mask"][idx]

-            sentence = output[torch.logical_and(token_type_ids == 0,
-                                                attention_mask == 1)]
-            candidate_label = output[torch.logical_and(token_type_ids == 1,
-                                                       attention_mask == 1)]
-            sentence_words = self.split_ngram(sentence, self.max_ngram)
-            candidate_label_mean = torch.mean(candidate_label,
-                                              dim=-2,
-                                              keepdim=True)
-            p = self.cosine_similarity(sentence_words, candidate_label_mean)
-            logits.append([torch.log(p), -torch.inf, torch.log(1 - p)])
-        logits = torch.FloatTensor(logits)
-        return SequenceClassifierOutput(
-            loss=None,
-            logits=logits,
-            hidden_states=None,
-            attentions=None,
-        )

     def cosine_similarity(
-            self,
-            sentence_words, candidate_label_mean):
         res = torch.tensor(0.)
-        for i in range(len(sentence_words)):
-            sw = sentence_words[i]
             p = torch.nn.functional.cosine_similarity(sw,
-                                                      candidate_label_mean,
                                                       dim=0)
             if p > res:
                 res = p
         return res

-    def split_ngram(self, sentences: TensorType["
-                    n: int) -> TensorType["
         res = []
         if len(sentences) <= n:
             return torch.stack([torch.mean(sentences, dim=0, keepdim=False)])

@@ -96,6 +106,136 @@ class FastTextForSeuqenceClassification(FastTextJpModel):
         return torch.stack(res)


 # Registration with AutoModel is required, but the recommended way seems to keep changing and is not settled. (2022/11/6)
 # https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
 FastTextForSeuqenceClassificationConfig.register_for_auto_class()
After (new file, lines 11-241; added lines are marked with +):

     model_type = "fasttext_classification"

     def __init__(self,
+                 ngram: int | list[int] = 2,
                  tokenizer_class="FastTextJpTokenizer",
                  **kwargs):
         """Initialization.

         Args:
+            ngram (int | list[int], optional):
+                N-gram size(s) used when splitting a sentence.
             tokenizer_class (str, optional):
                 Unless tokenizer_class is specified, the model cannot be
                 loaded from a pipeline. It is recorded in config.json.
         """
+        if isinstance(ngram, int):
+            self.ngrams = [ngram]
+        elif isinstance(ngram, list):
+            self.ngrams = ngram
+        else:
+            raise TypeError(f"got unknown type {type(ngram)}")
         kwargs["tokenizer_class"] = tokenizer_class
         super().__init__(**kwargs)


+class NgramForSeuqenceClassification():
+
+    def __init__(self):
+        ...

+    def __call__(self, sentence: TensorType["A", "vectors"],
+                 candidate_label: TensorType["B", "vectors"],
+                 ngram: int) -> TensorType[3]:
+        """Splits the sentence into n-grams and scores it by cosine similarity.

+        Args:
+            sentence (TensorType["A", "vectors"]): sentence vectors
+            candidate_label (TensorType["B", "vectors"]): label vectors
+            ngram (int): n-gram size

         Returns:
+            TensorType[3]:
+                Similarity of the sentences, as [Entailment, Neutral, Contradiction].
         """

+        sentence_ngrams = self.split_ngram(sentence, ngram)

+        candidate_label_mean = torch.mean(candidate_label, dim=0, keepdim=True)
+        p = self.cosine_similarity(sentence_ngrams, candidate_label_mean)
+        return torch.tensor([torch.log(p), -torch.inf, torch.log(1 - p)])

     def cosine_similarity(
+            self, sentence_ngrams: TensorType["ngrams", "vectors"],
+            candidate_label_mean: TensorType[1, "vectors"]) -> TensorType[1]:
+        """Computes cosine similarity.
+
+        Args:
+            sentence_ngrams (TensorType["ngrams", "vectors"]):
+                Sentence vectors split into n-grams
+            candidate_label_mean (TensorType[1, "vectors"]):
+                Label vector
+
+        Returns:
+            TensorType[1]: the highest cosine similarity over the n-grams
+        """
+
         res = torch.tensor(0.)
+        for i in range(len(sentence_ngrams)):
+            sw = sentence_ngrams[i]
             p = torch.nn.functional.cosine_similarity(sw,
+                                                      candidate_label_mean[0],
                                                       dim=0)
             if p > res:
                 res = p
         return res

+    def split_ngram(self, sentences: TensorType["A", "vectors"],
+                    n: int) -> TensorType["ngrams", "vectors"]:
+        """Splits the sentence into n-grams.
+        Args:
+            sentences(TensorType["A", "vectors"]):
+                the target sentence
+            n(int):
+                n-gram size
+        Returns:
+            TensorType["ngrams", "vectors"]:
+                The sentence split into n-grams
+        """
+
         res = []
         if len(sentences) <= n:
             return torch.stack([torch.mean(sentences, dim=0, keepdim=False)])
@@ -96,6 +106,136 @@ class FastTextForSeuqenceClassification(FastTextJpModel):
         return torch.stack(res)
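The diff leaves split_ngram's sliding-window loop (old lines 93-95 / new 103-105) as unchanged context, so it is not shown. Below is a small self-contained sketch of the scoring path; the window loop is my assumption standing in for those hidden lines, inferred from the `len(sentences) <= n` base case:

```python
import torch

# Toy rerun of the n-gram scoring above. The sliding-window loop is an
# assumption for the unchanged lines 103-105 that the diff does not show.
def split_ngram(sentences: torch.Tensor, n: int) -> torch.Tensor:
    if len(sentences) <= n:
        return torch.stack([torch.mean(sentences, dim=0, keepdim=False)])
    res = [torch.mean(sentences[i:i + n], dim=0)
           for i in range(len(sentences) - n + 1)]
    return torch.stack(res)

words = torch.randn(5, 300)   # 5 word vectors, 300-dim like fastText
label = torch.randn(3, 300)   # 3 label-word vectors

ngrams = split_ngram(words, 2)                      # shape [4, 300]
label_mean = torch.mean(label, dim=0, keepdim=True)

# Best cosine similarity over the n-grams, as in cosine_similarity above.
# Note: p can be negative for unrelated vectors, making log(p) NaN; the
# committed code has the same property.
p = max(torch.nn.functional.cosine_similarity(g, label_mean[0], dim=0)
        for g in ngrams)
logits = torch.tensor([torch.log(p), -torch.inf, torch.log(1 - p)])
print(ngrams.shape, logits)
```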
The new file continues with the layer that scans every configured n-gram size:

+class NgramsForSeuqenceClassification():
+
+    def __init__(self, config: FastTextForSeuqenceClassificationConfig):
+        self.max_ngrams = config.ngrams
+        self.ngram_layer = NgramForSeuqenceClassification()
+
+    def __call__(self, sentence: TensorType["A", "vectors"],
+                 candidate_label: TensorType["B", "vectors"]) -> TensorType[3]:
+        """Computes how related A and B are.
+        Args:
+            sentence(TensorType["A", "vectors"]):
+                the target sentence
+            candidate_label(TensorType["B", "vectors"]):
+                the label sentence
+
+        Returns:
+            TensorType[3]:
+                Similarity of the sentences, as [Entailment, Neutral, Contradiction].
+        """
+
+        res = [-torch.inf, -torch.inf, -torch.inf]
+        for ngram in self.max_ngrams:
+            logit = self.ngram_layer(sentence, candidate_label, ngram)
+            if logit[0] > res[0]:
+                res = logit
+        return torch.tensor(res)
+
+
+class BatchedNgramsForSeuqenceClassification():
+
+    def __init__(self, config: FastTextForSeuqenceClassificationConfig):
+        self.ngrams_layer = NgramsForSeuqenceClassification(config)
+
+    def __call__(
+        self,
+        last_hidden_state: TensorType["batch", "A+B", "vectors"],
+        token_type_ids: TensorType["batch", "A+B"],
+        attention_mask: TensorType["batch", "A+B"],
+    ) -> TensorType["batch", 3]:
+        """Computes how related A and B are.
+        Args:
+            last_hidden_state(TensorType["batch", "A+B", "vectors"]):
+                Output of the embeddings.
+            token_type_ids(TensorType["A+B"]):
+                Segment id of the sentence: 0 or 1, 1 for B.
+            attention_mask(TensorType["A+B"]):
+                Identifies padding: 0 or 1, 1 for tokens that are not padding.
+
+        Returns:
+            TensorType["batch", 3]:
+                Similarity of the sentences, as [Entailment, Neutral, Contradiction].
+        """
+
+        logits = []
+        embeddings = last_hidden_state
+        for idx in range(len(embeddings)):
+            vec = embeddings[idx]
+            # token_type_ids == 0 is the sentence, 1 is the label.
+            token_type_ids = token_type_ids[idx]
+            # attention_mask == 1 marks tokens that are not padding.
+            attention_mask = attention_mask[idx]
+
+            sentence, candidate_label = self.split_sentence(
+                vec, token_type_ids, attention_mask)
+            logit = self.ngrams_layer(sentence, candidate_label)
+            logits.append(logit)
+        logits = torch.tensor(logits)
+        return logits
+
+    def split_sentence(
+        self, vec: TensorType["A+B", "vectors"],
+        token_type_ids: TensorType["A+B"], attention_mask: TensorType["A+B"]
+    ) -> tuple[TensorType["A", "vectors"], TensorType["B", "vectors"]]:
+        """Because the model is a cross-encoder, splits the input back into the two sentences.
+
+        Args:
+            vec(TensorType["A+B","vectors"]):
+                word vectors
+
+            token_type_ids(TensorType["A+B"]):
+                Segment id of the sentence: 0 or 1, 1 for B.
+
+            attention_mask(TensorType["A+B"]):
+                Identifies padding: 0 or 1, 1 for tokens that are not padding.
+
+        Returns:
+            tuple[TensorType["A", "vectors"], TensorType["B", "vectors"]]:
+                The A and B sentences, split apart.
+        """
+
+        sentence = vec[torch.logical_and(token_type_ids == 0,
+                                         attention_mask == 1)]
+        candidate_label = vec[torch.logical_and(token_type_ids == 1,
+                                                attention_mask == 1)]
+        return sentence, candidate_label
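Since the tokenizer packs sentence and label into one cross-encoded sequence, split_sentence recovers the two halves with boolean masks. A toy run of the same masking, reusing the commit's own indexing expressions:

```python
import torch

# Sketch: how split_sentence carves a cross-encoded sequence back into
# the sentence (token_type_ids == 0) and the label (token_type_ids == 1),
# dropping padded positions (attention_mask == 0).
vec = torch.arange(18.).reshape(6, 3)               # 6 tokens, 3-dim toy vectors
token_type_ids = torch.tensor([0, 0, 0, 1, 1, 1])
attention_mask = torch.tensor([1, 1, 1, 1, 1, 0])   # last token is padding

sentence = vec[torch.logical_and(token_type_ids == 0, attention_mask == 1)]
candidate_label = vec[torch.logical_and(token_type_ids == 1, attention_mask == 1)]
print(sentence.shape)         # torch.Size([3, 3])
print(candidate_label.shape)  # torch.Size([2, 3])
```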
The new file closes with the model class that wires these layers into forward, plus the AutoClass registration:

+
+
+class FastTextForSeuqenceClassification(FastTextJpModel):
+    """Performs classification based on fastText vectors.
+    """
+
+    def __init__(self, config: FastTextForSeuqenceClassificationConfig):
+
+        self.layer = BatchedNgramsForSeuqenceClassification(config)
+        super().__init__(config)
+
+    def forward(
+        self,
+        input_ids: TensorType["batch", "A+B", "vectors"] = None,
+        attention_mask: TensorType["batch", "A+B"] = None,
+        token_type_ids: TensorType["batch", "A+B"] = None
+    ) -> SequenceClassifierOutput:
+        """Classifies against the candidate labels.
+
+        Returns:
+            SequenceClassifierOutput: probability that the candidate is correct
+        """
+        outputs = self.word_embeddings(input_ids)
+        logits = self.layer(last_hidden_state=outputs,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids)
+
+        return SequenceClassifierOutput(
+            loss=None,
+            logits=logits,
+            hidden_states=None,
+            attentions=None,
+        )


 # Registration with AutoModel is required, but the recommended way seems to keep changing and is not settled. (2022/11/6)
 # https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
 FastTextForSeuqenceClassificationConfig.register_for_auto_class()
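How the model is meant to be invoked is not part of this commit, but since forward returns [Entailment, Neutral, Contradiction] logits, it lines up with the zero-shot classification pipeline. A hedged usage sketch; the repo id and out-of-the-box pipeline compatibility are assumptions, not established by this diff:

```python
# Sketch only: load the repo's custom config/model/tokenizer via
# trust_remote_code and score candidate labels zero-shot. The repo id
# "paulhindemith/fasttext-classification" is an assumption.
from transformers import pipeline

classifier = pipeline("zero-shot-classification",
                      model="paulhindemith/fasttext-classification",
                      trust_remote_code=True)
print(classifier("これはテストです。", candidate_labels=["テスト", "ニュース"]))
```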