minskiter committed
Commit fdb0b54
1 parent: 747fa1f

feat(similar.py): update pipeline

README.md CHANGED
@@ -1,3 +1,11 @@
  ---
+ language:
+ - zh
  license: apache-2.0
+ tags:
+ - bert
+ - similar
+ pipeline_tag: other
  ---
+
+ ### BERT 中文相似度计算
config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "_name_or_path": "minskiter/simbert-chinese-bert-wwm-ext",
+   "architectures": [
+     "SimBertModel"
+   ],
+   "custom_pipelines": {
+     "sentences_sim": {
+       "impl": "minskiter/simbert-chinese-bert-wwm-ext--similar.SimilarPipeline",
+       "pt": "AutoModel",
+       "tf": []
+     }
+   },
+   "auto_map": {
+     "AutoModel": "minskiter/simbert-chinese-bert-wwm-ext--modeling_bert.SimBertModel",
+     "AutoConfig": "minskiter/simbert-chinese-bert-wwm-ext--configuration_bert.SimBertConfig"
+   },
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "simbert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.30.1",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 21128
+ }
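
The `custom_pipelines` and `auto_map` entries are what let `transformers` pull the classes in this commit straight from the repository when a caller opts in with `trust_remote_code=True`. A minimal loading sketch (nothing here beyond the repo id and the standard `AutoConfig`/`AutoModel` calls; any warnings printed may vary by `transformers` version):

```python
from transformers import AutoConfig, AutoModel

repo = "minskiter/simbert-chinese-bert-wwm-ext"

# auto_map routes these two calls to configuration_bert.SimBertConfig and
# modeling_bert.SimBertModel, which are fetched from the repo itself.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModel.from_pretrained(repo, trust_remote_code=True)

print(type(config).__name__)  # expected: SimBertConfig
print(type(model).__name__)   # expected: SimBertModel
```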
configuration_bert.py ADDED
@@ -0,0 +1,5 @@
+ from transformers import BertConfig
+
+ class SimBertConfig(BertConfig):
+
+     model_type = "simbert"
modeling_bert.py ADDED
@@ -0,0 +1,39 @@
+ from transformers import PretrainedConfig, PreTrainedModel, BertModel, BertConfig
+ from torch import nn
+
+ class SimBertModel(PreTrainedModel):
+     """ SimBert Model
+     """
+
+     config_class = BertConfig
+
+     def __init__(
+         self,
+         config: PretrainedConfig
+     ) -> None:
+         super().__init__(config)
+         self.bert = BertModel(config=config, add_pooling_layer=True)
+         self.fc = nn.Linear(config.hidden_size, 2)
+         # self.loss_fct = nn.CrossEntropyLoss()
+         self.loss_fct = nn.MSELoss()
+         self.softmax = nn.Softmax(dim=1)
+
+     def forward(
+         self,
+         input_ids,
+         token_type_ids,
+         attention_mask,
+         labels=None
+     ):
+         outputs = self.bert(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids
+         )
+         pooled_output = outputs.pooler_output
+         logits = self.fc(pooled_output)
+         logits = self.softmax(logits)[:,1]
+         if labels is not None:
+             loss = self.loss_fct(logits.view(-1), labels.view(-1))
+             return loss, logits
+         return None, logits
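
`SimBertModel` wraps a pooled `BertModel` with a two-way linear head and returns the softmax probability of class 1 as a similarity score; when `labels` are passed it additionally returns an MSE loss against that score. A sketch of calling the model directly, assuming the tokenizer and weights from this repo (the example sentences are arbitrary):

```python
import torch
from transformers import AutoModel, AutoTokenizer

repo = "minskiter/simbert-chinese-bert-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModel.from_pretrained(repo, trust_remote_code=True)
model.eval()

# Encode one sentence pair the same way SimilarPipeline.preprocess does.
batch = tokenizer(
    "今天天气不错",
    "今天天气很好",
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
with torch.no_grad():
    loss, score = model(**batch)  # loss is None because no labels were given
print(score)  # a 1-element tensor with the similarity score in [0, 1]
```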
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c73394cb3357f031cd1946df0acd09dfd7d0983bbb172aa9df3916028acf4a22
+ size 409149557
similar.py ADDED
@@ -0,0 +1,41 @@
+ from typing import Any, Dict, Tuple
+ from transformers import Pipeline
+ from transformers.pipelines.base import GenericTensor
+ from transformers.utils import ModelOutput
+ from typing import Union,List
+
+ class SimilarPipeline(Pipeline):
+     def __init__(self, max_length=512,*args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.max_length = max_length
+
+     def _sanitize_parameters(self, **pipeline_parameters):
+         return {},{},{}
+
+     def preprocess(self, input: Union[Tuple[str],List[Tuple[str]]], **preprocess_parameters: Dict) -> Dict[str, GenericTensor]:
+         if isinstance(input, list):
+             a = list(map(lambda x: x[0], input))
+             b = list(map(lambda x: x[1], input))
+         else:
+             a = input[0]
+             b = input[1]
+         tensors = self.tokenizer(
+             a,
+             b,
+             max_length=self.max_length,
+             padding="max_length",
+             truncation=True,
+             return_tensors="pt",
+         )
+         return tensors
+
+     def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput:
+         _,logits = self.model(**input_tensors)
+         return logits.tolist()
+
+     def postprocess(
+         self,
+         model_outputs: ModelOutput,
+         **postprocess_parameters: Dict
+     ) -> Any:
+         return model_outputs
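
Because config.json registers this class under `custom_pipelines.sentences_sim`, it can also be constructed through the standard `pipeline()` factory. A usage sketch inferred from `preprocess` (a single `(a, b)` tuple, or a list of tuples; the exact shape of the returned scores is not asserted here):

```python
from transformers import pipeline

similar = pipeline(
    "sentences_sim",
    model="minskiter/simbert-chinese-bert-wwm-ext",
    trust_remote_code=True,
)

# A single pair of sentences...
print(similar(("今天天气不错", "今天天气很好")))
# ...or several pairs at once.
print(similar([("我喜欢北京", "我爱北京"), ("我喜欢北京", "今天下雨了")]))
```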
tokenizer_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "padding": "max_length",
+   "max_length": 512,
+   "name_or_path": "hfl/chinese-bert-wwm-ext",
+   "tokenizer_class": "BertTokenizer"
+ }
vocab.txt ADDED
The diff for this file is too large to render.