omri374 committed
Commit 46c6fe7
Parent: 9998ba1

Upload 2 files
transformers_rec/configuration.py ADDED
@@ -0,0 +1,124 @@
+ ## Taken from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/configuration.py
+
+ STANFORD_CONFIGURATION = {
+     "DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
+     "PRESIDIO_SUPPORTED_ENTITIES": [
+         "LOCATION",
+         "PERSON",
+         "ORGANIZATION",
+         "AGE",
+         "PHONE_NUMBER",
+         "EMAIL",
+         "DATE_TIME",
+         "DEVICE",
+         "ZIP",
+         "PROFESSION",
+         "USERNAME",
+         "ID",
+     ],
+     "LABELS_TO_IGNORE": ["O"],
+     "DEFAULT_EXPLANATION": "Identified as {} by the StanfordAIMI/stanford-deidentifier-base NER model",
+     "SUB_WORD_AGGREGATION": "simple",
+     "DATASET_TO_PRESIDIO_MAPPING": {
+         "DATE": "DATE_TIME",
+         "DOCTOR": "PERSON",
+         "PATIENT": "PERSON",
+         "HOSPITAL": "LOCATION",
+         "MEDICALRECORD": "ID",
+         "IDNUM": "ID",
+         "ORGANIZATION": "ORGANIZATION",
+         "ZIP": "ZIP",
+         "PHONE": "PHONE_NUMBER",
+         "USERNAME": "USERNAME",
+         "STREET": "LOCATION",
+         "PROFESSION": "PROFESSION",
+         "COUNTRY": "LOCATION",
+         "LOCATION-OTHER": "LOCATION",
+         "FAX": "PHONE_NUMBER",
+         "EMAIL": "EMAIL",
+         "STATE": "LOCATION",
+         "DEVICE": "DEVICE",
+         "ORG": "ORGANIZATION",
+         "AGE": "AGE",
+     },
+     "MODEL_TO_PRESIDIO_MAPPING": {
+         "PER": "PERSON",
+         "PERSON": "PERSON",
+         "LOC": "LOCATION",
+         "ORG": "ORGANIZATION",
+         "AGE": "AGE",
+         "PATIENT": "PERSON",
+         "HCW": "PERSON",
+         "HOSPITAL": "LOCATION",
+         "PATORG": "ORGANIZATION",
+         "DATE": "DATE_TIME",
+         "PHONE": "PHONE_NUMBER",
+         "VENDOR": "ORGANIZATION",
+     },
+     "CHUNK_OVERLAP_SIZE": 40,
+     "CHUNK_SIZE": 600,
+     # Multiplier applied to the score of ID entities; read by
+     # TransformersRecognizer.load_transformer as ID_SCORE_REDUCTION.
+     "ID_SCORE_REDUCTION": 0.4,
+     "ID_ENTITY_NAME": "ID",
+ }
+
+
+ BERT_DEID_CONFIGURATION = {
+     "PRESIDIO_SUPPORTED_ENTITIES": [
+         "LOCATION",
+         "PERSON",
+         "ORGANIZATION",
+         "AGE",
+         "PHONE_NUMBER",
+         "EMAIL",
+         "DATE_TIME",
+         "ZIP",
+         "PROFESSION",
+         "USERNAME",
+         "ID",
+     ],
+     "DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
+     "LABELS_TO_IGNORE": ["O"],
+     "DEFAULT_EXPLANATION": "Identified as {} by the obi/deid_roberta_i2b2 NER model",
+     "SUB_WORD_AGGREGATION": "simple",
+     "DATASET_TO_PRESIDIO_MAPPING": {
+         "DATE": "DATE_TIME",
+         "DOCTOR": "PERSON",
+         "PATIENT": "PERSON",
+         "HOSPITAL": "ORGANIZATION",
+         "MEDICALRECORD": "O",
+         "IDNUM": "O",
+         "ORGANIZATION": "ORGANIZATION",
+         "ZIP": "O",
+         "PHONE": "PHONE_NUMBER",
+         "USERNAME": "",
+         "STREET": "LOCATION",
+         "PROFESSION": "PROFESSION",
+         "COUNTRY": "LOCATION",
+         "LOCATION-OTHER": "LOCATION",
+         "FAX": "PHONE_NUMBER",
+         "EMAIL": "EMAIL",
+         "STATE": "LOCATION",
+         "DEVICE": "O",
+         "ORG": "ORGANIZATION",
+         "AGE": "AGE",
+     },
+     "MODEL_TO_PRESIDIO_MAPPING": {
+         "PER": "PERSON",
+         "LOC": "LOCATION",
+         "ORG": "ORGANIZATION",
+         "AGE": "AGE",
+         "ID": "ID",
+         "EMAIL": "EMAIL",
+         "PATIENT": "PERSON",
+         "STAFF": "PERSON",
+         "HOSP": "ORGANIZATION",
+         "PATORG": "ORGANIZATION",
+         "DATE": "DATE_TIME",
+         "PHONE": "PHONE_NUMBER",
+     },
+     "CHUNK_OVERLAP_SIZE": 40,
+     "CHUNK_SIZE": 600,
+     # Multiplier applied to the score of ID entities; read by
+     # TransformersRecognizer.load_transformer as ID_SCORE_REDUCTION.
+     "ID_SCORE_REDUCTION": 0.4,
+     "ID_ENTITY_NAME": "ID",
+ }
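
Every key in these dicts is consumed by TransformersRecognizer.load_transformer in the file below, so adapting the recognizer to another checkpoint mostly means building a dict with the same keys. A minimal sketch, assuming a hypothetical "my-org/my-ner-model" checkpoint whose labels already follow Presidio's entity names:

    import copy

    from transformers_rec.configuration import BERT_DEID_CONFIGURATION

    # "my-org/my-ner-model" is a placeholder, not a real checkpoint.
    MY_CONFIGURATION = copy.deepcopy(BERT_DEID_CONFIGURATION)
    MY_CONFIGURATION["DEFAULT_MODEL_PATH"] = "my-org/my-ner-model"
    MY_CONFIGURATION["DEFAULT_EXPLANATION"] = (
        "Identified as {} by the my-org/my-ner-model NER model"
    )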
transformers_rec/transformers_recognizer.py ADDED
@@ -0,0 +1,339 @@
+ # Modified from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/transformer_recognizer.py
+
+ import copy
+ import logging
+ from typing import Optional, List
+
+ import torch
+ from presidio_analyzer import (
+     RecognizerResult,
+     EntityRecognizer,
+     AnalysisExplanation,
+ )
+ from presidio_analyzer.nlp_engine import NlpArtifacts
+
+ from .configuration import BERT_DEID_CONFIGURATION
+
+
+ logger = logging.getLogger("presidio-analyzer")
+
+ try:
+     from transformers import (
+         AutoTokenizer,
+         AutoModelForTokenClassification,
+         pipeline,
+         TokenClassificationPipeline,
+     )
+
+ except ImportError:
+     logger.error("transformers is not installed")
+
+
+ class TransformersRecognizer(EntityRecognizer):
+     """
+     Wrapper for a transformers model, to be used within Presidio Analyzer.
+
+     The class loads models hosted on HuggingFace - https://huggingface.co/ -
+     and wraps the model and tokenizer in a TokenClassification pipeline.
+     Samples are split into short text chunks, ideally shorter than the
+     max_length input_ids of the individual model, to avoid truncation by the
+     tokenizer and loss of information.
+
+     A configuration object should be maintained for each dataset-model
+     combination and translate entity names into a standardized view. A sample
+     configuration file is attached in the example.
+
+     :param supported_entities: List of entities to run inference on
+     :type supported_entities: Optional[List[str]]
+     :param pipeline: Instance of a TokenClassificationPipeline including a Tokenizer and a Model, defaults to None
+     :type pipeline: Optional[TokenClassificationPipeline], optional
+     :param model_path: String referencing a HuggingFace uploaded model to be used for inference, defaults to None
+     :type model_path: Optional[str], optional
+
+     :example:
+     >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+     >model_path = "obi/deid_roberta_i2b2"
+     >model_configuration = BERT_DEID_CONFIGURATION
+     >transformers_recognizer = TransformersRecognizer(model_path=model_path,
+     >    supported_entities=model_configuration.get("PRESIDIO_SUPPORTED_ENTITIES"))
+     >transformers_recognizer.load_transformer(**model_configuration)
+     >registry = RecognizerRegistry()
+     >registry.add_recognizer(transformers_recognizer)
+     >analyzer = AnalyzerEngine(registry=registry)
+     >sample = "My name is Christopher and I live in Irbid."
+     >results = analyzer.analyze(sample, language="en", return_decision_process=True)
+     >for result in results:
+     >    print(result, '----', sample[result.start:result.end])
+     """
+
+     def load(self) -> None:
+         pass
+
+     def __init__(
+         self,
+         model_path: Optional[str] = None,
+         pipeline: Optional[TokenClassificationPipeline] = None,
+         supported_entities: Optional[List[str]] = None,
+     ):
+         if not supported_entities:
+             supported_entities = BERT_DEID_CONFIGURATION[
+                 "PRESIDIO_SUPPORTED_ENTITIES"
+             ]
+         super().__init__(
+             supported_entities=supported_entities,
+             name=f"Transformers model {model_path}",
+         )
+
+         self.model_path = model_path
+         self.pipeline = pipeline
+         self.is_loaded = False
+
+         self.aggregation_mechanism = None
+         self.ignore_labels = None
+         self.model_to_presidio_mapping = None
+         self.entity_mapping = None
+         self.default_explanation = None
+         self.text_overlap_length = None
+         self.chunk_length = None
+         self.id_entity_name = None
+         self.id_score_reduction = None
+
+     def load_transformer(self, **kwargs) -> None:
+         """Load external configuration parameters and set default values.
+
+         :param kwargs: define default values for class attributes and modify pipeline behavior
+             **DATASET_TO_PRESIDIO_MAPPING (dict) - mapping of entity strings from dataset format to Presidio format
+             **MODEL_TO_PRESIDIO_MAPPING (dict) - mapping of entity strings from the chosen model format to Presidio format
+             **SUB_WORD_AGGREGATION (str) - how to aggregate sub-word tokens into full words and spans, as defined
+             in HuggingFace https://huggingface.co/transformers/v4.8.0/main_classes/pipelines.html#transformers.TokenClassificationPipeline # noqa
+             **CHUNK_OVERLAP_SIZE (int) - number of overlapping characters between consecutive text chunks
+             when splitting a single text into multiple inferences
+             **CHUNK_SIZE (int) - number of characters in each chunk of text
+             **LABELS_TO_IGNORE (List[str]) - labels to skip during evaluation. Defaults to ["O"]
+             **DEFAULT_EXPLANATION (str) - string format to use for prediction explanations
+             **ID_ENTITY_NAME (str) - name of the ID entity
+             **ID_SCORE_REDUCTION (float) - score multiplier for ID entities
+         """
+
+         self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
+         self.model_to_presidio_mapping = kwargs.get("MODEL_TO_PRESIDIO_MAPPING", {})
+         self.ignore_labels = kwargs.get("LABELS_TO_IGNORE", ["O"])
+         self.aggregation_mechanism = kwargs.get("SUB_WORD_AGGREGATION", "simple")
+         self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
+         self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
+         self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
+         self.id_entity_name = kwargs.get("ID_ENTITY_NAME", "ID")
+         self.id_score_reduction = kwargs.get("ID_SCORE_REDUCTION", 0.5)
+
+         if not self.pipeline:
+             if not self.model_path:
+                 self.model_path = "obi/deid_roberta_i2b2"
+                 logger.warning(
+                     f"Both 'pipeline' and 'model_path' arguments are None. Using default model_path={self.model_path}"
+                 )
+
+             self._load_pipeline()
+
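+     # Illustrative usage (sketch, not called by the class itself): a recognizer
+     # is typically configured with one of the dicts from configuration.py, e.g.
+     #   recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+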
+     def _load_pipeline(self) -> None:
+         """Initialize the NER transformers pipeline using the provided model_path."""
+
+         logger.debug(f"Initializing NER pipeline using {self.model_path} path")
+         device = 0 if torch.cuda.is_available() else -1
+         self.pipeline = pipeline(
+             "ner",
+             model=AutoModelForTokenClassification.from_pretrained(self.model_path),
+             tokenizer=AutoTokenizer.from_pretrained(self.model_path),
+             # Will attempt to group sub-entities to word level
+             aggregation_strategy=self.aggregation_mechanism,
+             device=device,
+             framework="pt",
+             ignore_labels=self.ignore_labels,
+         )
+
+         self.is_loaded = True
+
+     def get_supported_entities(self) -> List[str]:
+         """
+         Return the entities supported by this recognizer.
+
+         :return: List of the supported entities.
+         """
+         return self.supported_entities
+
+     def analyze(
+         self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
+     ) -> List[RecognizerResult]:
+         """
+         Analyze text using the transformers model to produce NER tagging.
+
+         :param text: The text for analysis.
+         :param entities: The requested entities; not honored by this recognizer
+             (all supported entities are returned).
+         :param nlp_artifacts: Not used by this recognizer.
+         :return: The list of Presidio RecognizerResult constructed from the
+             recognized transformers detections.
+         """
+
+         results = []
+         # Run the transformer model on the provided text
+         ner_results = self._get_ner_results_for_text(text)
+
+         for res in ner_results:
+             res["entity_group"] = self.__check_label_transformer(res["entity_group"])
+             logger.debug(f"NER result after label mapping: {res}")
+             if not res["entity_group"]:
+                 continue
+
+             if res["entity_group"] == self.id_entity_name:
+                 logger.debug(
+                     f"ID entity found, multiplying score by {self.id_score_reduction}"
+                 )
+                 res["score"] = res["score"] * self.id_score_reduction
+
+             textual_explanation = self.default_explanation.format(res["entity_group"])
+             explanation = self.build_transformers_explanation(
+                 float(round(res["score"], 2)), textual_explanation, res["word"]
+             )
+             transformers_result = self._convert_to_recognizer_result(res, explanation)
+
+             results.append(transformers_result)
+
+         return results
+
+     @staticmethod
+     def split_text_to_word_chunks(
+         input_length: int, chunk_length: int, overlap_length: int
+     ) -> List[List]:
+         """Calculate chunk boundaries of size chunk_length over the input.
+
+         Consecutive chunks overlap by overlap_length characters so the model
+         keeps context and continuity across chunk borders.
+
+         :param input_length: Length (in characters) of the given text
+         :type input_length: int
+         :param chunk_length: Length of each chunk of text.
+             Should fit within the max input length of the transformer model
+         :type chunk_length: int
+         :param overlap_length: Number of overlapping characters between chunks
+         :type overlap_length: int
+         :return: List of start and end positions for individual text chunks
+         :rtype: List[List]
+         """
+         if input_length < chunk_length:
+             return [[0, input_length]]
+         if chunk_length <= overlap_length:
+             logger.warning(
+                 "overlap_length should be shorter than chunk_length, setting overlap_length to half of chunk_length"
+             )
+             overlap_length = chunk_length // 2
+         return [
+             [i, min([i + chunk_length, input_length])]
+             for i in range(
+                 0, input_length - overlap_length, chunk_length - overlap_length
+             )
+         ]
+
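+     # Worked example: for a 1,000-character text with the default
+     # CHUNK_SIZE=600 and CHUNK_OVERLAP_SIZE=40,
+     #   split_text_to_word_chunks(1000, 600, 40) -> [[0, 600], [560, 1000]]
+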
+     def _get_ner_results_for_text(self, text: str) -> List[dict]:
+         """Run model inference on the provided text.
+
+         The text is split into chunks with overlapping characters,
+         each chunk is run through the pipeline, and duplicate
+         predictions are removed.
+
+         :param text: The text to run inference on
+         :type text: str
+         :return: List of entity predictions on the word level
+         :rtype: List[dict]
+         """
+         model_max_length = self.pipeline.tokenizer.model_max_length
+         text_length = len(text)
+         # model_max_length counts tokens while text_length counts characters;
+         # a token spans at least one character, so this check is conservative.
+         if text_length <= model_max_length:
+             predictions = self.pipeline(text)
+         else:
+             logger.info(
+                 f"splitting the text into chunks, length {text_length} > {model_max_length}"
+             )
+             predictions = []
+             chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
+                 text_length, self.chunk_length, self.text_overlap_length
+             )
+
+             # iterate over the text chunks and run inference on each
+             for chunk_start, chunk_end in chunk_indexes:
+                 chunk_text = text[chunk_start:chunk_end]
+                 chunk_preds = self.pipeline(chunk_text)
+
+                 # align indexes to the original text - add chunk_start to each position
+                 aligned_predictions = []
+                 for prediction in chunk_preds:
+                     prediction_tmp = copy.deepcopy(prediction)
+                     prediction_tmp["start"] += chunk_start
+                     prediction_tmp["end"] += chunk_start
+                     aligned_predictions.append(prediction_tmp)
+
+                 predictions.extend(aligned_predictions)
+
+         # remove duplicate predictions produced by the overlapping chunks
+         predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}]
+         return predictions
+
+     @staticmethod
+     def _convert_to_recognizer_result(
+         prediction_result: dict, explanation: AnalysisExplanation
+     ) -> RecognizerResult:
+         """Parse an NER model prediction into a RecognizerResult to enable downstream analysis.
+
+         :param prediction_result: A single entity prediction
+         :type prediction_result: dict
+         :param explanation: Structured representation of the model prediction
+         :type explanation: AnalysisExplanation
+         :return: An instance of RecognizerResult to be used in downstream evaluation
+         :rtype: RecognizerResult
+         """
+
+         transformers_results = RecognizerResult(
+             entity_type=prediction_result["entity_group"],
+             start=prediction_result["start"],
+             end=prediction_result["end"],
+             score=float(round(prediction_result["score"], 2)),
+             analysis_explanation=explanation,
+         )
+
+         return transformers_results
+
+     def build_transformers_explanation(
+         self,
+         original_score: float,
+         explanation: str,
+         pattern: str,
+     ) -> AnalysisExplanation:
+         """
+         Create an explanation for why this result was detected.
+
+         :param original_score: Score given by this recognizer
+         :param explanation: Explanation string
+         :param pattern: The detected word or span that triggered the prediction
+             (stored in the AnalysisExplanation's pattern field)
+         :return: Structured explanation and scores of an NER model prediction
+         :rtype: AnalysisExplanation
+         """
+         explanation = AnalysisExplanation(
+             recognizer=self.__class__.__name__,
+             original_score=float(original_score),
+             textual_explanation=explanation,
+             pattern=pattern,
+         )
+         return explanation
+
+     def __check_label_transformer(self, label: str) -> Optional[str]:
+         """Validate that the predicted label is identified by Presidio and map it to a Presidio entity name.
+
+         :param label: Label predicted by the model
+         :return: The adjusted entity name, or None if the label should be ignored
+         """
+
+         # convert the model label to a Presidio label
+         entity = self.model_to_presidio_mapping.get(label, None)
+
+         if entity in self.ignore_labels:
+             return None
+
+         if entity is None:
+             logger.warning(f"Found unrecognized label {label}, returning label as-is")
+             return label
+
+         if entity not in self.supported_entities:
+             logger.warning(f"Found entity {entity} which is not supported by Presidio")
+             return entity
+         return entity
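
A minimal end-to-end sketch of wiring the two files together, adapted from the class docstring example (assumes presidio-analyzer, transformers, torch, and the default spaCy English model are installed):

    from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

    from transformers_rec.configuration import BERT_DEID_CONFIGURATION
    from transformers_rec.transformers_recognizer import TransformersRecognizer

    # Build the recognizer around the obi/deid_roberta_i2b2 model and
    # configure it from the matching configuration dict.
    recognizer = TransformersRecognizer(
        model_path=BERT_DEID_CONFIGURATION["DEFAULT_MODEL_PATH"],
        supported_entities=BERT_DEID_CONFIGURATION["PRESIDIO_SUPPORTED_ENTITIES"],
    )
    recognizer.load_transformer(**BERT_DEID_CONFIGURATION)

    # Register it and run the Presidio analyzer.
    registry = RecognizerRegistry()
    registry.add_recognizer(recognizer)
    analyzer = AnalyzerEngine(registry=registry)

    sample = "My name is Christopher and I live in Irbid."
    results = analyzer.analyze(sample, language="en", return_decision_process=True)
    for result in results:
        print(result, "----", sample[result.start:result.end])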