sgugger committed on
Commit
04b6c3e
1 Parent(s): bb89bad

Upload model and tool

Browse files
__init__.py ADDED
File without changes
config.json CHANGED
@@ -1,33 +1,36 @@
1
  {
 
2
  "architectures": [
3
  "BertForSequenceClassification"
4
  ],
5
  "attention_probs_dropout_prob": 0.1,
6
  "classifier_dropout": null,
7
- "custom_pipelines": {
8
- "pair-classification": {
9
- "impl": "pair_classification.PairClassificationPipeline",
10
- "pt": [
11
- "AutoModelForSequenceClassification"
12
- ],
13
- "tf": []
14
- }
15
- },
16
  "hidden_act": "gelu",
17
  "hidden_dropout_prob": 0.1,
18
- "hidden_size": 32,
 
 
 
 
19
  "initializer_range": 0.02,
20
- "intermediate_size": 37,
 
 
 
 
21
  "layer_norm_eps": 1e-12,
22
  "max_position_embeddings": 512,
23
  "model_type": "bert",
24
- "num_attention_heads": 4,
25
- "num_hidden_layers": 5,
26
  "pad_token_id": 0,
27
  "position_embedding_type": "absolute",
 
28
  "torch_dtype": "float32",
29
  "transformers_version": "4.29.0.dev0",
30
  "type_vocab_size": 2,
31
  "use_cache": true,
32
- "vocab_size": 99
33
  }
 
1
  {
2
+ "_name_or_path": "sgugger/bert-finetuned-mrpc",
3
  "architectures": [
4
  "BertForSequenceClassification"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
  "classifier_dropout": null,
8
+ "finetuning_task": "mrpc",
9
+ "gradient_checkpointing": false,
 
 
 
 
 
 
 
10
  "hidden_act": "gelu",
11
  "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "not_equivalent",
15
+ "1": "equivalent"
16
+ },
17
  "initializer_range": 0.02,
18
+ "intermediate_size": 3072,
19
+ "label2id": {
20
+ "equivalent": 1,
21
+ "not_equivalent": 0
22
+ },
23
  "layer_norm_eps": 1e-12,
24
  "max_position_embeddings": 512,
25
  "model_type": "bert",
26
+ "num_attention_heads": 12,
27
+ "num_hidden_layers": 12,
28
  "pad_token_id": 0,
29
  "position_embedding_type": "absolute",
30
+ "problem_type": "single_label_classification",
31
  "torch_dtype": "float32",
32
  "transformers_version": "4.29.0.dev0",
33
  "type_vocab_size": 2,
34
  "use_cache": true,
35
+ "vocab_size": 28996
36
  }
pair_classification.py DELETED
@@ -1,33 +0,0 @@
1
- import numpy as np
2
-
3
- from transformers import Pipeline
4
-
5
-
6
- def softmax(outputs):
7
- maxes = np.max(outputs, axis=-1, keepdims=True)
8
- shifted_exp = np.exp(outputs - maxes)
9
- return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
10
-
11
-
12
- class PairClassificationPipeline(Pipeline):
13
- def _sanitize_parameters(self, **kwargs):
14
- preprocess_kwargs = {}
15
- if "second_text" in kwargs:
16
- preprocess_kwargs["second_text"] = kwargs["second_text"]
17
- return preprocess_kwargs, {}, {}
18
-
19
- def preprocess(self, text, second_text=None):
20
- return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework)
21
-
22
- def _forward(self, model_inputs):
23
- return self.model(**model_inputs)
24
-
25
- def postprocess(self, model_outputs):
26
- logits = model_outputs.logits[0].numpy()
27
- probabilities = softmax(logits)
28
-
29
- best_class = np.argmax(probabilities)
30
- label = self.model.config.id2label[best_class]
31
- score = probabilities[best_class].item()
32
- logits = logits.tolist()
33
- return {"label": label, "score": score, "logits": logits}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pair_classification_tool.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
3
+ from transformers.tools import PipelineTool
4
+
5
+
6
class TextPairClassificationTool(PipelineTool):
    """Tool that classifies whether two English texts are similar.

    The two texts are tokenized together as one pair, scored by a sequence
    classification model, and the best label is returned with its softmax
    probability.
    """

    default_checkpoint = "sgugger/bert-finetuned-mrpc"
    pre_processor_class = AutoTokenizer
    model_class = AutoModelForSequenceClassification

    # The "{labels}" placeholder is substituted by `post_init` with the label
    # names found in the model config.
    description = (
        "classifies if two texts in English are similar or not using the labels {labels}. It takes two inputs named "
        "`text` and `second_text` which should be in English and returns a dictionary with two keys named 'label' "
        "(the predicted label ) and 'score' (the probability associated to it)."
    )

    def post_init(self):
        """Fill the ``{labels}`` placeholder of ``description``.

        The label names are read from ``label2id`` in the model config and
        rendered as a quoted, human-readable enumeration.

        Raises:
            ValueError: If the config declares fewer than two labels.
        """
        # `self.model` may still be a checkpoint identifier rather than a
        # loaded model; in that case only the config is fetched.
        if isinstance(self.model, str):
            config = AutoConfig.from_pretrained(self.model)
        else:
            config = self.model.config

        labels = [f"'{label}'" for label in config.label2id]
        if len(labels) < 2:
            raise ValueError("Not enough labels.")

        if len(labels) == 2:
            # Two items read as "'a' and 'b'"; a comma before "and" is only
            # correct for enumerations of three or more items.
            labels_string = " and ".join(labels)
        else:
            labels_string = ", ".join(labels[:-1]) + f", and {labels[-1]}"

        self.description = self.description.replace("{labels}", labels_string)

    def encode(self, text, second_text):
        """Tokenize ``text`` and ``second_text`` as a pair of PyTorch tensors.

        Args:
            text: The first text of the pair.
            second_text: The second text of the pair.

        Returns:
            The pre-processor output, with tensors in PyTorch format.
        """
        # Truncate to the pre-processor's maximum length so an over-long pair
        # does not exceed the model's position embeddings (512 positions for
        # the default checkpoint's config).
        return self.pre_processor(text, second_text, truncation=True, return_tensors="pt")

    def decode(self, outputs):
        """Convert model outputs into the predicted label and its probability.

        Only the first element of the batch is read.

        Args:
            outputs: Model outputs exposing a ``logits`` tensor.

        Returns:
            A dict with the keys ``"label"`` (name looked up in the model
            config's ``id2label``) and ``"score"`` (softmax probability of
            that label).
        """
        first_logits = outputs.logits[0]
        probabilities = torch.nn.functional.softmax(first_logits, dim=-1)
        label_id = torch.argmax(first_logits).item()
        label = self.model.config.id2label[label_id]
        return {"label": label, "score": probabilities[label_id].item()}
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:882ec9af8732f10b0b2a63bcff2d0b6d245e542dbf9f89143322149fbfd2562e
3
- size 251775
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d51a9228c2bfe086be5020b9627e5693324d9f65e7e99bfdb5a1952d213cafa
3
+ size 433320053
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,11 +1,9 @@
1
  {
2
  "clean_up_tokenization_spaces": true,
3
  "cls_token": "[CLS]",
4
- "do_basic_tokenize": true,
5
- "do_lower_case": true,
6
  "mask_token": "[MASK]",
7
- "model_max_length": 1000000000000000019884624838656,
8
- "never_split": null,
9
  "pad_token": "[PAD]",
10
  "sep_token": "[SEP]",
11
  "strip_accents": null,
 
1
  {
2
  "clean_up_tokenization_spaces": true,
3
  "cls_token": "[CLS]",
4
+ "do_lower_case": false,
 
5
  "mask_token": "[MASK]",
6
+ "model_max_length": 512,
 
7
  "pad_token": "[PAD]",
8
  "sep_token": "[SEP]",
9
  "strip_accents": null,
tool_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "custom_tools": {"text-pair-classification": "pair_classification_tool.TextPairClassificationTool"}
3
+ }
vocab.txt CHANGED
The diff for this file is too large to render. See raw diff