Taizo Kaneko committed
Commit 7fe102c
1 Parent(s): 67a2f9a

commit files to HF hub

Files changed (3):
  1. config.json (+3 -2)
  2. fasttext_fsc.py (+69 -24)
  3. pytorch_model.bin (+2 -2)
config.json CHANGED
@@ -3,7 +3,7 @@
     "FastTextForSeuqenceClassification"
   ],
   "auto_map": {
-    "AutoConfig": "fasttext_jp_embedding.FastTextJpConfig",
+    "AutoConfig": "fasttext_fsc.FastTextForSeuqenceClassificationConfig",
     "AutoModelForSequenceClassification": "fasttext_fsc.FastTextForSeuqenceClassification"
   },
   "hidden_size": 300,
@@ -19,9 +19,10 @@
   },
   "max_length": 128,
   "model_type": "fasttext_jp",
+  "ngram": 2,
   "tokenizerI_class": "FastTextJpTokenizer",
   "tokenizer_class": "FastTextJpTokenizer",
   "torch_dtype": "float32",
   "transformers_version": "4.23.1",
-  "vocab_size": 500
+  "vocab_size": 2000000
 }
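The auto_map change points AutoConfig at the new FastTextForSeuqenceClassificationConfig in fasttext_fsc.py, so both the config and the model resolve through the standard Auto* classes. A minimal loading sketch, assuming a hypothetical repo id (trust_remote_code is needed because the classes live in the repo, not in transformers):

from transformers import AutoConfig, AutoModelForSequenceClassification

repo_id = "owner/fasttext-jp-model"  # hypothetical repo id, not from this commit
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.ngram, config.vocab_size)  # -> 2 2000000 with this commit's config.json
model = AutoModelForSequenceClassification.from_pretrained(
    repo_id, trust_remote_code=True)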
fasttext_fsc.py CHANGED
@@ -7,51 +7,96 @@ from .fasttext_jp_embedding import FastTextJpModel, FastTextJpConfig
 from transformers.modeling_outputs import SequenceClassifierOutput
 
 
+class FastTextForSeuqenceClassificationConfig(FastTextJpConfig):
+    """Config for FastTextJpModel.
+    """
+    model_type = "fasttext_jp"
+
+    def __init__(self,
+                 ngram: int = 2,
+                 tokenizer_class="FastTextJpTokenizer",
+                 **kwargs):
+        """Initialization.
+
+        Args:
+            ngram (int, optional):
+                N-gram size used to split the sentence.
+            tokenizer_class (str, optional):
+                Unless tokenizer_class is specified, the model will not
+                load from a pipeline. Written to config.json.
+        """
+        self.ngram = ngram
+        kwargs["tokenizer_class"] = tokenizer_class
+        super().__init__(**kwargs)
+
+
 class FastTextForSeuqenceClassification(FastTextJpModel):
     """Performs classification based on fastText vectors.
     """
 
-    def __init__(self, config: FastTextJpConfig):
+    def __init__(self, config: FastTextForSeuqenceClassificationConfig):
+
+        self.ngram = config.ngram
         super().__init__(config)
 
     def forward(self, **inputs) -> SequenceClassifierOutput:
-        """Computes the embedding.
+        """Performs classification over the candidate labels.
 
         Returns:
-            TensorType["batch", "word", "vectors"]: one vector per word.
+            SequenceClassifierOutput: the probability that the candidate is correct
         """
         input_ids = inputs["input_ids"]
         outputs = self.word_embeddings(input_ids)
-        sentence = outputs[torch.logical_and(inputs["attention_mask"] == 1,
-                                             inputs["token_type_ids"] == 0)]
-        candidate_label = outputs[torch.logical_and(
-            inputs["attention_mask"] == 1, inputs["token_type_ids"] == 1)]
-
-        sentence_mean = torch.mean(sentence, dim=-2, keepdim=True)
-        candidate_label_mean = torch.mean(candidate_label,
-                                          dim=-2,
-                                          keepdim=True)
-        if sentence_mean.dim() == 2:
-            p = torch.nn.functional.cosine_similarity(sentence_mean,
-                                                      candidate_label_mean,
-                                                      dim=1)
-            logits = [[torch.log(p), -torch.inf, torch.log(1 - p)]]
-        else:
-            logits = []
-            # batch
-            for sm, clm in zip(sentence_mean, candidate_label_mean):
-                p = torch.nn.functional.cosine_similarity(sm, clm, dim=1)
-                logits.append([[torch.log(p), -torch.inf, torch.log(1 - p)]])
+
+        logits = []
+        for idx in range(len(outputs)):
+            output = outputs[idx]
+            # token_type_ids == 0 marks the sentence, 1 marks the label.
+            token_type_ids = inputs["token_type_ids"][idx]
+            # attention_mask == 1 marks non-padding tokens.
+            attention_mask = inputs["attention_mask"][idx]
+
+            sentence = output[torch.logical_and(token_type_ids == 0,
+                                                attention_mask == 1)]
+            candidate_label = output[torch.logical_and(token_type_ids == 1,
+                                                       attention_mask == 1)]
+            sentence_words = self.split_ngram(sentence, self.ngram)
+            candidate_label_mean = torch.mean(candidate_label,
+                                              dim=-2,
+                                              keepdim=True)
+            p = self.cosine_similarity(sentence_words, candidate_label_mean)
+            logits.append([torch.log(p), -torch.inf, torch.log(1 - p)])
         logits = torch.FloatTensor(logits)
         return SequenceClassifierOutput(
             loss=None,
-            logits=logits,  # type: ignore
+            logits=logits,
             hidden_states=None,
             attentions=None,
         )
 
+    def cosine_similarity(
+            self, sentence_words: TensorType["words", "vectors"],
+            candidate_label_means: TensorType[1, "vectors"]) -> TensorType[1]:
+        res = torch.tensor(0.)
+        for sw in sentence_words:
+            p = torch.nn.functional.cosine_similarity(sw,
+                                                      candidate_label_means[0],
+                                                      dim=0)
+            if p > res:
+                res = p
+        return res
+
+    def split_ngram(self, sentences: TensorType["word", "vectors"],
+                    n: int) -> TensorType["word", "vectors"]:
+        res = []
+        for i in range(len(sentences) - n + 1):
+            ngram = sentences[i:i + n]
+            res.append(torch.mean(ngram, dim=0, keepdim=False))
+        return torch.stack(res)
+
 
 # Registration with AutoModel is required, but the recommended way seems to keep changing and is not settled. (2022/11/6)
 # https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
+FastTextForSeuqenceClassificationConfig.register_for_auto_class()
 FastTextForSeuqenceClassification.register_for_auto_class(
     "AutoModelForSequenceClassification")
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0a56d3c08a7e47dbe704cf7eb322380ae83c6b8d2becea81b4ae03da9486c026
-size 600829
+oid sha256:ba58a6e9bba7142a3d3507fc094345ae2e5ebb222fe98cdf5b2146487895314e
+size 2400000829
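The checkpoint growth tracks the config change: a float32 embedding matrix of vocab_size x hidden_size dominates the file, and both the old and new LFS sizes exceed that product by the same 829 bytes of serialization overhead. A quick check:

old = 500 * 300 * 4        # 600_000       -> LFS size   600_829
new = 2_000_000 * 300 * 4  # 2_400_000_000 -> LFS size 2_400_000_829
assert 600_829 - old == 2_400_000_829 - new == 829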