minskiter committed on
Commit 57034b1
1 Parent(s): 81a6267

feat(model): update model parameters

README.md CHANGED
@@ -1,3 +1,11 @@
  ---
+ language:
+ - zh
  license: apache-2.0
+ tags:
+ - bert
+ - similar
+ pipeline_tag: other
  ---
+
+ ### BERT Chinese Similarity (中文相似度计算)
config.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "_name_or_path": "minskiter/cossim-bert-chinese-wwm-ext",
+   "architectures": [
+     "CosSimBertModel"
+   ],
+   "tokenizer_class": "BertTokenizer",
+   "custom_pipelines": {
+     "sentences_sim": {
+       "impl": "minskiter/cossim-bert-chinese-wwm-ext--similar.SimilarPipeline",
+       "pt": "AutoModel",
+       "tf": []
+     },
+     "textencode": {
+       "impl": "minskiter/cossim-bert-chinese-wwm-ext--similar.EncodePipeline",
+       "pt": "AutoModel",
+       "tf": []
+     }
+   },
+   "auto_map": {
+     "AutoModel": "minskiter/cossim-bert-chinese-wwm-ext--modeling_bert.CosSimBertModel",
+     "AutoConfig": "minskiter/cossim-bert-chinese-wwm-ext--configuration_bert.SimBertConfig"
+   },
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "simbert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.30.1",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 21128
+ }
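
The `auto_map` and `custom_pipelines` entries are what let stock `transformers` calls resolve the custom classes added in this commit. A minimal sketch of that resolution, assuming the commit is published under the repo id `minskiter/cossim-bert-chinese-wwm-ext` and that enabling `trust_remote_code` is acceptable:

```python
# Minimal sketch: auto_map routes AutoConfig/AutoModel to the custom classes
# defined in configuration_bert.py and modeling_bert.py on the Hub.
# The repo id and trust_remote_code=True are assumptions, not part of this diff.
from transformers import AutoConfig, AutoModel

repo = "minskiter/cossim-bert-chinese-wwm-ext"

config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(config.model_type)      # "simbert" (SimBertConfig)

model = AutoModel.from_pretrained(repo, trust_remote_code=True)
print(type(model).__name__)   # "CosSimBertModel"
```
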
configuration_bert.py ADDED
@@ -0,0 +1,5 @@
+ from transformers import BertConfig
+
+ class SimBertConfig(BertConfig):
+
+     model_type = "simbert"
modeling_bert.py ADDED
@@ -0,0 +1,97 @@
+ from transformers import PretrainedConfig, PreTrainedModel, BertModel, BertConfig
+ from .configuration_bert import SimBertConfig
+ from torch import nn
+
+ class SimBertModel(PreTrainedModel):
+     """ SimBert Model
+     """
+
+     config_class = SimBertConfig
+
+     def __init__(
+         self,
+         config: PretrainedConfig
+     ) -> None:
+         super().__init__(config)
+         self.bert = BertModel(config=config, add_pooling_layer=True)
+         self.fc = nn.Linear(config.hidden_size, 2)
+         # self.loss_fct = nn.CrossEntropyLoss()
+         self.loss_fct = nn.MSELoss()
+         self.softmax = nn.Softmax(dim=1)
+
+     def forward(
+         self,
+         input_ids,
+         token_type_ids,
+         attention_mask,
+         labels=None
+     ):
+         outputs = self.bert(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids
+         )
+         pooled_output = outputs.pooler_output
+         logits = self.fc(pooled_output)
+         logits = self.softmax(logits)[:,1]
+         if labels is not None:
+             loss = self.loss_fct(logits.view(-1), labels.view(-1))
+             return loss, logits
+         return None, logits
+
+ class CosSimBertModel(PreTrainedModel):
+     """ CosSimBert Model
+     """
+
+     config_class = SimBertConfig
+
+     def __init__(
+         self,
+         config: PretrainedConfig
+     ) -> None:
+         super().__init__(config)
+         self.bert = BertModel(config=config, add_pooling_layer=True)
+         self.loss_fct = nn.MSELoss()
+         self.softmax = nn.Softmax(dim=1)
+
+     def forward(
+         self,
+         input_ids,
+         token_type_ids,
+         attention_mask,
+         labels=None
+     ):
+         seq_length = input_ids.size(-1)
+         a = {
+             "input_ids": input_ids[:,:seq_length//2],
+             "token_type_ids": token_type_ids[:,:seq_length//2],
+             "attention_mask": attention_mask[:,:seq_length//2]
+         }
+         b = {
+             "input_ids": input_ids[:,seq_length//2:],
+             "token_type_ids": token_type_ids[:,seq_length//2:],
+             "attention_mask": attention_mask[:,seq_length//2:]
+         }
+         outputs_a = self.bert(**a)
+         outputs_b = self.bert(**b)
+         pooled_a_output = outputs_a.pooler_output
+         pooled_b_output = outputs_b.pooler_output
+         logits = nn.functional.cosine_similarity(pooled_a_output, pooled_b_output)
+         if labels is not None:
+             loss = self.loss_fct(logits.view(-1), labels.view(-1))
+             return loss, logits
+         return None, logits
+
+     def encode(
+         self,
+         input_ids,
+         token_type_ids,
+         attention_mask,
+     ):
+         outputs = self.bert(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids
+         )
+         pooled_output = outputs.pooler_output
+         return pooled_output
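
`CosSimBertModel.forward` expects a sentence pair packed along the sequence dimension: it splits `input_ids` at `seq_length // 2`, runs both halves through the shared BERT encoder, and returns the cosine similarity of the two pooler outputs (plus an MSE loss when `labels` are given), while `encode` returns the pooler output for a single batch. A minimal sketch of calling the model directly; the repo id, `trust_remote_code=True`, and the per-sentence `max_length` are assumptions:

```python
# Minimal sketch: build inputs the way CosSimBertModel.forward splits them,
# i.e. both sentences padded to the same length and concatenated on dim=1.
import torch
from transformers import AutoModel, AutoTokenizer

repo = "minskiter/cossim-bert-chinese-wwm-ext"   # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModel.from_pretrained(repo, trust_remote_code=True)
model.eval()

def pack_pair(a: str, b: str, max_length: int = 128):
    """Tokenize each sentence to a fixed length and concatenate along the
    sequence dimension so forward() can split them at seq_length // 2."""
    enc_a = tokenizer(a, max_length=max_length, padding="max_length",
                      truncation=True, return_tensors="pt")
    enc_b = tokenizer(b, max_length=max_length, padding="max_length",
                      truncation=True, return_tensors="pt")
    return {k: torch.cat((enc_a[k], enc_b[k]), dim=1) for k in enc_a}

with torch.no_grad():
    _, score = model(**pack_pair("今天天气不错", "今天天气很好"))
print(score.item())   # cosine similarity of the two pooled [CLS] vectors
```
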
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a89f77c91c7e9b1bdce180a136bc257fedaf753168ffffb47be07736b01ab80d
+ size 409142765
similar.py ADDED
@@ -0,0 +1,83 @@
+ from typing import Any, Dict, Tuple
+ from transformers import Pipeline
+ from transformers.pipelines.base import GenericTensor
+ from transformers.utils import ModelOutput
+ from typing import Union,List
+ import torch
+
+ class EncodePipeline(Pipeline):
+     def __init__(self, max_length=256,*args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.max_length = max_length
+
+     def _sanitize_parameters(self, **pipeline_parameters):
+         return {},{},{}
+
+     def preprocess(self, input: Union[Tuple[str],List[Tuple[str]]], **preprocess_parameters: Dict) -> Dict[str, GenericTensor]:
+         tensors = self.tokenizer(
+             input,
+             max_length=self.max_length,
+             padding="max_length",
+             truncation=True,
+             return_tensors="pt",
+         )
+         return tensors
+
+     def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput:
+         logits = self.model.encode(**input_tensors)
+         return logits.tolist()
+
+     def postprocess(
+         self,
+         model_outputs: ModelOutput,
+         **postprocess_parameters: Dict
+     ) -> Any:
+         return model_outputs
+
+
+ class SimilarPipeline(Pipeline):
+     def __init__(self, max_length=256,*args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.max_length = max_length
+
+     def _sanitize_parameters(self, **pipeline_parameters):
+         return {},{},{}
+
+     def preprocess(self, input: Union[Tuple[str],List[Tuple[str]]], **preprocess_parameters: Dict) -> Dict[str, GenericTensor]:
+         if isinstance(input, list):
+             a = list(map(lambda x: x[0], input))
+             b = list(map(lambda x: x[1], input))
+         else:
+             a = input[0]
+             b = input[1]
+         tensors = self.tokenizer(
+             a,
+             max_length=self.max_length,
+             padding="max_length",
+             truncation=True,
+             return_tensors="pt",
+         )
+         tensors_b = self.tokenizer(
+             b,
+             max_length=self.max_length,
+             padding="max_length",
+             truncation=True,
+             return_tensors="pt",
+         )
+         for key in tensors:
+             tensors[key] = torch.cat((tensors[key],tensors_b[key]),dim=0)
+         return tensors
+
+     def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput:
+         _,logits = self.model(**input_tensors)
+         logits_a = logits[:logits.size(0)//2]
+         logits_b = logits[logits.size(0)//2:]
+         logits = torch.nn.functional.cosine_similarity(logits_a, logits_b)
+         return logits.tolist()
+
+     def postprocess(
+         self,
+         model_outputs: ModelOutput,
+         **postprocess_parameters: Dict
+     ) -> Any:
+         return model_outputs
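
Together with the `custom_pipelines` block in `config.json`, these classes are exposed as the tasks `sentences_sim` (pair scoring via `SimilarPipeline`) and `textencode` (sentence embeddings via `EncodePipeline`) through the standard `pipeline()` factory. A minimal sketch using the encoder task and computing the similarity from the returned vectors; the repo id and `trust_remote_code=True` are assumptions:

```python
# Minimal sketch: "textencode" returns the pooled vector from
# CosSimBertModel.encode() as a nested Python list of shape (1, 768).
import torch
from transformers import pipeline

encode = pipeline(
    "textencode",
    model="minskiter/cossim-bert-chinese-wwm-ext",  # assumed repo id
    trust_remote_code=True,
)

vec_a = torch.tensor(encode("今天天气不错"))   # shape (1, 768)
vec_b = torch.tensor(encode("今天天气很好"))
score = torch.nn.functional.cosine_similarity(vec_a, vec_b)
print(score.item())
```
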
tokenizer_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "padding": "max_length",
+   "max_length": 512,
+   "name_or_path": "hfl/chinese-bert-wwm-ext",
+   "tokenizer_class": "BertTokenizer"
+ }
vocab.txt ADDED
The diff for this file is too large to render.