Kalaoke committed on
Commit
d6df1bf
·
1 Parent(s): 58c3574

add custom handler and modify pipeline

Browse files
Files changed (2) hide show
  1. __pycache__/handler.cpython-37.pyc +0 -0
  2. handler.py +199 -10
__pycache__/handler.cpython-37.pyc CHANGED
Binary files a/__pycache__/handler.cpython-37.pyc and b/__pycache__/handler.cpython-37.pyc differ
 
handler.py CHANGED
@@ -1,11 +1,150 @@
1
- from typing import Dict, List, Any
2
  from dataclasses import dataclass
3
  import torch
4
- from transformers import AutoTokenizer
5
- from transformers import pipeline
 
 
 
 
6
  from transformers.pipelines import PIPELINE_REGISTRY
7
- from bibert_multitask_classification import BiBert_MultiTaskPipeline
8
- from bert_for_sequence_classification import BertForSequenceClassification
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  @dataclass
11
  class Task:
@@ -14,17 +153,67 @@ class Task:
14
  type: str
15
  num_labels: int
16
 
17
- PIPELINE_REGISTRY.register_pipeline(
18
- "bibert-multitask-classification",
19
- pipeline_class=BiBert_MultiTaskPipeline,
20
- pt_model=BertForSequenceClassification
21
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  class EndpointHandler():
23
  def __init__(self, path=""):
24
  # Preload all the elements you are going to need at inference.
25
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
  tokenizer = AutoTokenizer.from_pretrained(path)
27
 
 
28
  tasks = [
29
  Task(id=0, name='label_classification', type='seq_classification', num_labels=5),
30
  Task(id=1, name='binary_classification', type='seq_classification', num_labels=2)
 
1
+ from typing import Dict, List, Any, Optional, Tuple, Union
2
  from dataclasses import dataclass
3
  import torch
4
+ from torch import nn
5
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
6
+ import numpy as np
7
+ import transformers
8
+ from transformers import AutoTokenizer, BertTokenizer
9
+ from transformers import Pipeline, pipeline
10
  from transformers.pipelines import PIPELINE_REGISTRY
11
+ from transformers import models
12
+ from transformers.modeling_outputs import SequenceClassifierOutput
13
+ from transformers.models.bert.configuration_bert import BertConfig
14
+ from transformers.models.bert.modeling_bert import (
15
+ BertPreTrainedModel,
16
+ BERT_INPUTS_DOCSTRING,
17
+ _TOKENIZER_FOR_DOC,
18
+ _CHECKPOINT_FOR_DOC,
19
+ BERT_START_DOCSTRING,
20
+ _CONFIG_FOR_DOC,
21
+ _SEQ_CLASS_EXPECTED_OUTPUT,
22
+ _SEQ_CLASS_EXPECTED_LOSS,
23
+ BertModel,
24
+ )
25
+
26
+ from transformers.file_utils import (
27
+ add_code_sample_docstrings,
28
+ add_start_docstrings_to_model_forward,
29
+ add_start_docstrings
30
+ )
31
+
32
@add_start_docstrings(
    """
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    BERT_START_DOCSTRING,
)
class BertForSequenceClassification(BertPreTrainedModel):
    """Multi-task BERT classifier.

    One shared BERT encoder with two task-specific linear heads
    (``classifier1`` / ``classifier2``); each example in a batch is routed
    to a head according to its entry in ``task_ids``.
    """

    def __init__(self, config, **kwargs):
        # NOTE(review): the base class is initialised with a brand-new empty
        # PretrainedConfig() rather than the real ``config`` — presumably to
        # bypass num_labels-dependent setup in BertPreTrainedModel; confirm
        # this is intentional.
        super().__init__(transformers.PretrainedConfig())
        #task_labels_map={"binary_classification": 2, "label_classification": 5}
        # ``tasks_map`` is expected to be indexable by 0 and 1 with objects
        # exposing ``.num_labels`` (see Task dataclass); the {} default would
        # raise KeyError in the classifier construction below — TODO confirm
        # callers always pass tasks_map.
        self.tasks = kwargs.get("tasks_map", {})
        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        ## add task specific output heads
        self.classifier1 = nn.Linear(
            config.hidden_size, self.tasks[0].num_labels
        )
        self.classifier2 = nn.Linear(
            config.hidden_size, self.tasks[1].num_labels
        )

        self.init_weights()

    @add_start_docstrings_to_model_forward(
        BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")
    )
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
        expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        task_ids=None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # outputs[1] is BERT's pooled [CLS] representation.
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)

        # Route each example to its task head.
        # NOTE(review): ``task_ids`` has no None-handling — torch.unique(None)
        # would fail, so callers must always supply it; and if the loop body
        # never runs, ``loss`` below is unbound. Confirm callers guarantee a
        # non-empty task_ids tensor.
        unique_task_ids_list = torch.unique(task_ids).tolist()
        loss_list = []
        logits = None
        for unique_task_id in unique_task_ids_list:

            loss = None
            task_id_filter = task_ids == unique_task_id

            if unique_task_id == 0:
                logits = self.classifier1(pooled_output[task_id_filter])
            elif unique_task_id == 1:
                logits = self.classifier2(pooled_output[task_id_filter])


            if labels is not None:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.tasks[unique_task_id].num_labels), labels[task_id_filter].view(-1))
                loss_list.append(loss)

        # logits are only used for eval. and in case of eval the batch is not multi task
        # For training only the loss is used
        # (in a mixed batch, ``logits`` is overwritten each iteration and ends
        # up holding only the last task's logits)

        # Average the per-task losses when any were computed.
        if loss_list:
            loss = torch.stack(loss_list).mean()
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
+ )
148
 
149
  @dataclass
150
  class Task:
 
153
  type: str
154
  num_labels: int
155
 
156
def softmax(_outputs):
    """Numerically stable softmax over the last axis of *_outputs*."""
    # Shifting by the per-row maximum before exponentiating prevents
    # overflow without changing the result.
    stabilized = _outputs - np.max(_outputs, axis=-1, keepdims=True)
    exps = np.exp(stabilized)
    normalizer = exps.sum(axis=-1, keepdims=True)
    return exps / normalizer
160
+
161
class BiBert_MultiTaskPipeline(Pipeline):
    """Text-classification pipeline for the multi-task BERT model.

    Adds a ``task_id`` keyword that selects which classification head the
    model routes the input to, and a ``top_k`` keyword controlling how many
    labels postprocessing returns.
    """


    def _sanitize_parameters(self, **kwargs):
        # Split user-supplied kwargs across the three pipeline stages:
        # ``task_id`` goes to both preprocess and _forward, ``top_k`` only
        # to postprocess (and switches postprocess out of legacy mode).
        preprocess_kwargs = {}
        if "task_id" in kwargs:
            preprocess_kwargs["task_id"] = kwargs["task_id"]

        forward_kwargs = {}
        if "task_id" in kwargs:
            forward_kwargs["task_id"] = kwargs["task_id"]

        postprocess_kwargs = {}
        if "top_k" in kwargs:
            postprocess_kwargs["top_k"] = kwargs["top_k"]
            postprocess_kwargs["_legacy"] = False
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs



    def preprocess(self, inputs, task_id):
        # Tokenize and move to the pipeline's device; attach a one-element
        # task-id tensor so the model knows which head to apply.
        return_tensors = self.framework
        feature = self.tokenizer(inputs, padding = True, return_tensors=return_tensors).to(self.device)
        task_ids = np.full(shape=1,fill_value=task_id, dtype=int)
        feature["task_ids"] = torch.IntTensor(task_ids)
        return feature

    def _forward(self, model_inputs, task_id):
        # ``task_id`` already travels inside model_inputs["task_ids"]
        # (set in preprocess), so the explicit parameter is unused here.
        return self.model(**model_inputs)

    def postprocess(self, model_outputs, top_k=1, _legacy=True):
        # First row of logits — preprocess builds single-example batches.
        outputs = model_outputs["logits"][0]
        # NOTE(review): .numpy() requires a CPU tensor; presumably the base
        # Pipeline moves outputs off-GPU first — confirm on CUDA deployments.
        outputs = outputs.numpy()
        scores = softmax(outputs)

        if top_k == 1 and _legacy:
            return {"label": self.model.config.id2label[scores.argmax().item()], "score": scores.max().item()}

        dict_scores = [
            {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores)
        ]
        if not _legacy:
            # Non-legacy mode: best-first ordering, truncated to top_k.
            dict_scores.sort(key=lambda x: x["score"], reverse=True)
            if top_k is not None:
                dict_scores = dict_scores[:top_k]
        return dict_scores
208
+
209
+
210
  class EndpointHandler():
211
  def __init__(self, path=""):
212
  # Preload all the elements you are going to need at inference.
213
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
214
  tokenizer = AutoTokenizer.from_pretrained(path)
215
 
216
+ PIPELINE_REGISTRY.register_pipeline("bibert-multitask-classification", pipeline_class=BiBert_MultiTaskPipeline, pt_model=BertForSequenceClassification)
217
  tasks = [
218
  Task(id=0, name='label_classification', type='seq_classification', num_labels=5),
219
  Task(id=1, name='binary_classification', type='seq_classification', num_labels=2)