presidio committed on
Commit
ec2ae46
1 Parent(s): 57594ac

Delete transformers_rec

Browse files
transformers_rec/__init__.py DELETED
@@ -1,5 +0,0 @@
1
"""Public API of the ``transformers_rec`` package.

Re-exports the two bundled model configurations and the recognizer class so
callers can import them directly from the package.
"""

from .configuration import BERT_DEID_CONFIGURATION, STANFORD_COFIGURATION
from .transformers_recognizer import TransformersRecognizer

# Names exported by ``from transformers_rec import *``.
__all__ = [
    "BERT_DEID_CONFIGURATION",
    "STANFORD_COFIGURATION",
    "TransformersRecognizer",
]
5
-
 
 
 
 
 
 
transformers_rec/configuration.py DELETED
@@ -1,124 +0,0 @@
1
- ## Taken from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/configuration.py
2
-
3
# Configuration for the StanfordAIMI/stanford-deidentifier-base model.
# The keys are passed as keyword arguments to
# ``TransformersRecognizer.load_transformer``.
STANFORD_COFIGURATION = {
    "DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
    # Entity names reported to Presidio as supported by this recognizer.
    "PRESIDIO_SUPPORTED_ENTITIES": [
        "LOCATION",
        "PERSON",
        "ORGANIZATION",
        "AGE",
        "PHONE_NUMBER",
        "EMAIL",
        "DATE_TIME",
        "DEVICE",
        "ZIP",
        "PROFESSION",
        "USERNAME",
        "ID",
    ],
    "LABELS_TO_IGNORE": ["O"],
    # ``{}`` is filled with the detected entity name.
    "DEFAULT_EXPLANATION": "Identified as {} by the StanfordAIMI/stanford-deidentifier-base NER model",
    "SUB_WORD_AGGREGATION": "simple",
    # Dataset label -> Presidio entity name.
    "DATASET_TO_PRESIDIO_MAPPING": {
        "DATE": "DATE_TIME",
        "DOCTOR": "PERSON",
        "PATIENT": "PERSON",
        "HOSPITAL": "LOCATION",
        "MEDICALRECORD": "ID",
        "IDNUM": "ID",
        "ORGANIZATION": "ORGANIZATION",
        "ZIP": "ZIP",
        "PHONE": "PHONE_NUMBER",
        "USERNAME": "USERNAME",
        "STREET": "LOCATION",
        "PROFESSION": "PROFESSION",
        "COUNTRY": "LOCATION",
        "LOCATION-OTHER": "LOCATION",
        "FAX": "PHONE_NUMBER",
        "EMAIL": "EMAIL",
        "STATE": "LOCATION",
        "DEVICE": "DEVICE",
        "ORG": "ORGANIZATION",
        "AGE": "AGE",
    },
    # Model output label -> Presidio entity name.
    "MODEL_TO_PRESIDIO_MAPPING": {
        "PER": "PERSON",
        "PERSON": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        "AGE": "AGE",
        "PATIENT": "PERSON",
        "HCW": "PERSON",
        "HOSPITAL": "LOCATION",
        "PATORG": "ORGANIZATION",
        "DATE": "DATE_TIME",
        "PHONE": "PHONE_NUMBER",
        "VENDOR": "ORGANIZATION",
    },
    # Chunking parameters, in characters.
    "CHUNK_OVERLAP_SIZE": 40,
    "CHUNK_SIZE": 600,
    "ID_SCORE_MULTIPLIER": 0.4,
    # The recognizer's ``load_transformer`` reads ``ID_SCORE_REDUCTION``; the
    # value is mirrored under that key so the configured multiplier is
    # actually applied instead of the recognizer's built-in default.
    "ID_SCORE_REDUCTION": 0.4,
    "ID_ENTITY_NAME": "ID",
}

# Correctly spelled alias; the misspelled name is kept for existing importers.
STANFORD_CONFIGURATION = STANFORD_COFIGURATION
64
-
65
-
66
# Configuration for the obi/deid_roberta_i2b2 model.
# The keys are passed as keyword arguments to
# ``TransformersRecognizer.load_transformer``.
BERT_DEID_CONFIGURATION = {
    # Entity names reported to Presidio as supported by this recognizer.
    "PRESIDIO_SUPPORTED_ENTITIES": [
        "LOCATION",
        "PERSON",
        "ORGANIZATION",
        "AGE",
        "PHONE_NUMBER",
        "EMAIL",
        "DATE_TIME",
        "ZIP",
        "PROFESSION",
        "USERNAME",
        "ID",
    ],
    "DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
    "LABELS_TO_IGNORE": ["O"],
    # ``{}`` is filled with the detected entity name.
    "DEFAULT_EXPLANATION": "Identified as {} by the obi/deid_roberta_i2b2 NER model",
    "SUB_WORD_AGGREGATION": "simple",
    # Dataset label -> Presidio entity name ("O" marks a label that is dropped).
    "DATASET_TO_PRESIDIO_MAPPING": {
        "DATE": "DATE_TIME",
        "DOCTOR": "PERSON",
        "PATIENT": "PERSON",
        "HOSPITAL": "ORGANIZATION",
        "MEDICALRECORD": "O",
        "IDNUM": "O",
        "ORGANIZATION": "ORGANIZATION",
        "ZIP": "O",
        "PHONE": "PHONE_NUMBER",
        # NOTE(review): empty string rather than "O" or "USERNAME" -- looks
        # unintentional, left unchanged; confirm the intended target.
        "USERNAME": "",
        "STREET": "LOCATION",
        "PROFESSION": "PROFESSION",
        "COUNTRY": "LOCATION",
        "LOCATION-OTHER": "LOCATION",
        "FAX": "PHONE_NUMBER",
        "EMAIL": "EMAIL",
        "STATE": "LOCATION",
        "DEVICE": "O",
        "ORG": "ORGANIZATION",
        "AGE": "AGE",
    },
    # Model output label -> Presidio entity name.
    "MODEL_TO_PRESIDIO_MAPPING": {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        "AGE": "AGE",
        "ID": "ID",
        "EMAIL": "EMAIL",
        "PATIENT": "PERSON",
        "STAFF": "PERSON",
        "HOSP": "ORGANIZATION",
        "PATORG": "ORGANIZATION",
        "DATE": "DATE_TIME",
        "PHONE": "PHONE_NUMBER",
    },
    # Chunking parameters, in characters.
    "CHUNK_OVERLAP_SIZE": 40,
    "CHUNK_SIZE": 600,
    "ID_SCORE_MULTIPLIER": 0.4,
    # The recognizer's ``load_transformer`` reads ``ID_SCORE_REDUCTION``; the
    # value is mirrored under that key so the configured multiplier is
    # actually applied instead of the recognizer's built-in default.
    "ID_SCORE_REDUCTION": 0.4,
    "ID_ENTITY_NAME": "ID",
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
transformers_rec/transformers_recognizer.py DELETED
@@ -1,336 +0,0 @@
1
- # Modified from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/transformer_recognizer.py
2
-
3
- import copy
4
- import logging
5
- from typing import Optional, List
6
-
7
- import torch
8
- from presidio_analyzer import (
9
- RecognizerResult,
10
- EntityRecognizer,
11
- AnalysisExplanation,
12
- )
13
- from presidio_analyzer.nlp_engine import NlpArtifacts
14
-
15
- from .configuration import BERT_DEID_CONFIGURATION
16
-
17
-
18
logger = logging.getLogger("presidio-analyzer")

# ``transformers`` is treated as an optional dependency: a missing install is
# logged instead of raised.
try:
    from transformers import (
        AutoTokenizer,
        AutoModelForTokenClassification,
        pipeline,
        TokenClassificationPipeline,
    )
except ImportError:
    # The missing package is ``transformers`` (not this package).
    logger.error("transformers is not installed")
    # Code later in this module refers to these names while it is being
    # defined (e.g. in type annotations). Bind placeholders so the module
    # still imports; actually using them fails only when a model is loaded.
    AutoTokenizer = None
    AutoModelForTokenClassification = None
    pipeline = None
    TokenClassificationPipeline = None
30
-
31
-
32
class TransformersRecognizer(EntityRecognizer):
    """
    Presidio recognizer backed by a HuggingFace token-classification model.

    The class loads models hosted on HuggingFace - https://huggingface.co/
    and loads the model and tokenizer into a TokenClassification pipeline.
    Long samples are split into short text chunks, ideally shorter than the
    max_length input_ids of the individual model, to avoid truncation by the
    Tokenizer and loss of information.

    A configuration object should be maintained for each dataset-model
    combination and translate entity names into a standardized view.

    :param model_path: string referencing a HuggingFace uploaded model to be
        used for inference, defaults to None
    :type model_path: Optional[str], optional
    :param pipeline: Instance of a TokenClassificationPipeline including a
        Tokenizer and a Model, defaults to None
    :type pipeline: Optional[TokenClassificationPipeline], optional
    :param supported_entities: List of entities to run inference on; defaults
        to ``BERT_DEID_CONFIGURATION["PRESIDIO_SUPPORTED_ENTITIES"]``
    :type supported_entities: Optional[List[str]]

    :example
    >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
    >model_configuration = BERT_DEID_CONFIGURATION
    >model_path = "obi/deid_roberta_i2b2"
    >transformers_recognizer = TransformersRecognizer(
    >    model_path=model_path,
    >    supported_entities=model_configuration.get("PRESIDIO_SUPPORTED_ENTITIES"))
    >transformers_recognizer.load_transformer(**model_configuration)
    >registry = RecognizerRegistry()
    >registry.add_recognizer(transformers_recognizer)
    >analyzer = AnalyzerEngine(registry=registry)
    >sample = "My name is Christopher and I live in Irbid."
    >results = analyzer.analyze(sample, language="en", return_decision_process=True)

    >for result in results:
    >    print(result, '----', sample[result.start:result.end])
    """

    def load(self) -> None:
        """Do nothing; the model is loaded explicitly via ``load_transformer``."""
        pass

    def __init__(
        self,
        model_path: Optional[str] = None,
        pipeline: Optional[TokenClassificationPipeline] = None,
        supported_entities: Optional[List[str]] = None,
    ):
        """Store the model reference; no model is loaded at this point.

        :param model_path: HuggingFace model identifier or path
        :param pipeline: Ready-made token-classification pipeline to use
            instead of loading one from ``model_path``
        :param supported_entities: Presidio entity names this recognizer
            reports; falls back to the BERT de-id configuration's list
        """
        if not supported_entities:
            supported_entities = BERT_DEID_CONFIGURATION[
                "PRESIDIO_SUPPORTED_ENTITIES"
            ]
        super().__init__(
            supported_entities=supported_entities,
            name=f"Transformers model {model_path}",
        )

        self.model_path = model_path
        self.pipeline = pipeline
        self.is_loaded = False

        # All of the following are populated by ``load_transformer``.
        self.aggregation_mechanism = None
        self.ignore_labels = None
        self.model_to_presidio_mapping = None
        self.entity_mapping = None
        self.default_explanation = None
        self.text_overlap_length = None
        self.chunk_length = None
        self.id_entity_name = None
        self.id_score_reduction = None

    def load_transformer(self, **kwargs) -> None:
        """Load external configuration parameters and set default values.

        :param kwargs: define default values for class attributes and modify pipeline behavior
            **DATASET_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from dataset format to Presidio format
            **MODEL_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from chosen model format to Presidio format
            **SUB_WORD_AGGREGATION(str) - define how to aggregate sub-word tokens into full words and spans as defined
            in HuggingFace https://huggingface.co/transformers/v4.8.0/main_classes/pipelines.html#transformers.TokenClassificationPipeline # noqa
            **CHUNK_OVERLAP_SIZE (int) - number of overlapping characters in each text chunk
            when splitting a single text into multiple inferences
            **CHUNK_SIZE (int) - number of characters in each chunk of text
            **LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"]
            **DEFAULT_EXPLANATION (str) - string format to use for prediction explanations;
            a generic template naming the model is used when omitted
            **ID_ENTITY_NAME (str) - name of the ID entity
            **ID_SCORE_REDUCTION (float) - score multiplier for ID entities
            **ID_SCORE_MULTIPLIER (float) - alternative key for the same multiplier,
            used when ID_SCORE_REDUCTION is not given
        """

        self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
        self.model_to_presidio_mapping = kwargs.get("MODEL_TO_PRESIDIO_MAPPING", {})
        self.ignore_labels = kwargs.get("LABELS_TO_IGNORE", ["O"])
        self.aggregation_mechanism = kwargs.get("SUB_WORD_AGGREGATION", "simple")
        self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
        self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
        self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
        self.id_entity_name = kwargs.get("ID_ENTITY_NAME", "ID")
        # The bundled configurations spell this key ID_SCORE_MULTIPLIER, so
        # accept it as a fallback; otherwise their value is silently ignored.
        self.id_score_reduction = kwargs.get(
            "ID_SCORE_REDUCTION", kwargs.get("ID_SCORE_MULTIPLIER", 0.5)
        )

        if not self.pipeline:
            if not self.model_path:
                self.model_path = "obi/deid_roberta_i2b2"
                logger.warning(
                    f"Both 'model' and 'model_path' arguments are None. Using default model_path={self.model_path}"
                )
            self._load_pipeline()
        else:
            # A pipeline supplied by the caller is used as-is.
            self.is_loaded = True

        if self.default_explanation is None:
            # ``analyze`` calls ``.format`` on this template, so it must
            # always be a string.
            model_name = self.model_path or "transformers"
            self.default_explanation = (
                "Identified as {} by the " + str(model_name) + " NER model"
            )

    def _load_pipeline(self) -> None:
        """Initialize NER transformers_rec pipeline using the model_path provided"""

        logger.debug(f"Initializing NER pipeline using {self.model_path} path")
        # Use the first CUDA device when available, otherwise the CPU (-1).
        device = 0 if torch.cuda.is_available() else -1
        self.pipeline = pipeline(
            "ner",
            model=AutoModelForTokenClassification.from_pretrained(self.model_path),
            tokenizer=AutoTokenizer.from_pretrained(self.model_path),
            # Will attempt to group sub-entities to word level
            aggregation_strategy=self.aggregation_mechanism,
            device=device,
            framework="pt",
            ignore_labels=self.ignore_labels,
        )

        self.is_loaded = True

    def get_supported_entities(self) -> List[str]:
        """
        Return supported entities by this model.
        :return: List of the supported entities.
        """
        return self.supported_entities

    # Class to use transformers_rec with Presidio as an external recognizer.
    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
    ) -> List[RecognizerResult]:
        """
        Analyze text using transformers_rec model to produce NER tagging.
        :param text : The text for analysis.
        :param entities: Not used for filtering by this recognizer.
        :param nlp_artifacts: Not used by this recognizer.
        :return: The list of Presidio RecognizerResult constructed from the recognized
            transformers_rec detections.
        """

        results = []
        # Run transformer model on the provided text
        ner_results = self._get_ner_results_for_text(text)

        for res in ner_results:
            res["entity_group"] = self.__check_label_transformer(res["entity_group"])
            if not res["entity_group"]:
                # Ignored label (or mapped to an empty name): drop it.
                continue

            if res["entity_group"] == self.id_entity_name:
                # Diagnostics go through the logger rather than stdout.
                logger.debug(
                    f"ID entity found, multiplying score by {self.id_score_reduction}"
                )
                res["score"] = res["score"] * self.id_score_reduction

            textual_explanation = self.default_explanation.format(res["entity_group"])
            explanation = self.build_transformers_explanation(
                float(round(res["score"], 2)), textual_explanation, res["word"]
            )
            results.append(self._convert_to_recognizer_result(res, explanation))

        return results

    @staticmethod
    def split_text_to_word_chunks(
        input_length: int, chunk_length: int, overlap_length: int
    ) -> List[List]:
        """Calculate [start, end] positions of overlapping chunks covering the input.

        Each chunk spans at most ``chunk_length`` positions and consecutive
        chunks share ``overlap_length`` positions to create context and
        continuity for the model.

        :param input_length: Length of the input to split
        :type input_length: int
        :param chunk_length: Length of each chunk.
            Should match the max input length of the transformer model
        :type chunk_length: int
        :param overlap_length: Number of overlapping positions in each chunk
        :type overlap_length: int
        :return: List of start and end positions for individual text chunks
        :rtype: List[List]
        :raises ValueError: if ``chunk_length`` is not positive
        """
        if chunk_length <= 0:
            # A non-positive chunk size would otherwise produce a zero step
            # (ValueError from range) or silently yield no chunks at all.
            raise ValueError("chunk_length must be a positive integer")
        if input_length < chunk_length:
            return [[0, input_length]]
        if chunk_length <= overlap_length:
            logger.warning(
                "overlap_length should be shorter than chunk_length, setting overlap_length to by half of chunk_length"
            )
            overlap_length = chunk_length // 2
        step = chunk_length - overlap_length
        return [
            [start, min(start + chunk_length, input_length)]
            for start in range(0, input_length - overlap_length, step)
        ]

    def _get_ner_results_for_text(self, text: str) -> List[dict]:
        """The function runs model inference on the provided text.
        The text is split into chunks with n overlapping characters.
        The results are then aggregated and duplications are removed.

        Predictions covering the same span with the same label are treated as
        duplicates; the highest-scoring one is kept, in first-seen order.

        :param text: The text to run inference on
        :type text: str
        :return: List of entity predictions on the word level
        :rtype: List[dict]
        """
        model_max_length = self.pipeline.tokenizer.model_max_length
        # calculate inputs based on the text
        text_length = len(text)
        # split text into chunks
        if text_length <= model_max_length:
            predictions = self.pipeline(text)
        else:
            logger.info(
                f"splitting the text into chunks, length {text_length} > {model_max_length}"
            )
            predictions = []
            chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
                text_length, self.chunk_length, self.text_overlap_length
            )

            # iterate over text chunks and run inference
            for chunk_start, chunk_end in chunk_indexes:
                chunk_preds = self.pipeline(text[chunk_start:chunk_end])

                # align indexes to match the original text - add to each
                # position the value of chunk_start (on a copy, so the
                # pipeline's own output is left untouched)
                predictions.extend(
                    {
                        **prediction,
                        "start": prediction["start"] + chunk_start,
                        "end": prediction["end"] + chunk_start,
                    }
                    for prediction in chunk_preds
                )

        # remove duplicates: overlapping chunks report the same span twice,
        # usually with marginally different scores, so key on label and span
        # instead of on the whole record. A dict keeps first-seen order.
        unique = {}
        for prediction in predictions:
            key = (
                prediction["entity_group"],
                prediction["start"],
                prediction["end"],
            )
            kept = unique.get(key)
            if kept is None:
                unique[key] = dict(prediction)
            elif prediction["score"] > kept["score"]:
                # Update in place so the entry keeps its original position.
                kept.update(prediction)
        return list(unique.values())

    @staticmethod
    def _convert_to_recognizer_result(
        prediction_result: dict, explanation: AnalysisExplanation
    ) -> RecognizerResult:
        """The method parses NER model predictions into a RecognizerResult format to enable down the stream analysis

        :param prediction_result: A single example of entity prediction
        :type prediction_result: dict
        :param explanation: Structured explanation of the model prediction
        :type explanation: AnalysisExplanation
        :return: An instance of RecognizerResult which is used to model evaluation calculations
        :rtype: RecognizerResult
        """

        return RecognizerResult(
            entity_type=prediction_result["entity_group"],
            start=prediction_result["start"],
            end=prediction_result["end"],
            score=float(round(prediction_result["score"], 2)),
            analysis_explanation=explanation,
        )

    def build_transformers_explanation(
        self,
        original_score: float,
        explanation: str,
        pattern: str,
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.
        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :param pattern: Value stored in the explanation's ``pattern`` field
            (``analyze`` passes the detected word)
        :return Structured explanation and scores of a NER model prediction
        :rtype: AnalysisExplanation
        """
        return AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=float(original_score),
            textual_explanation=explanation,
            pattern=pattern,
        )

    def __check_label_transformer(self, label: str) -> Optional[str]:
        """The function validates the predicted label is identified by Presidio
        and maps the string into a Presidio representation
        :param label: Predicted label by the model
        :return: Returns the adjusted entity name, or None when the label
            (or the entity it maps to) is listed in the labels to ignore
        """

        # The ignore list holds raw model labels such as "O", so test the
        # label itself and not only the entity it maps to.
        if label in self.ignore_labels:
            return None

        # convert model label to presidio label
        entity = self.model_to_presidio_mapping.get(label, None)

        if entity is None:
            logger.warning(f"Found unrecognized label {label}, returning entity as is")
            return label

        if entity in self.ignore_labels:
            return None

        if entity not in self.supported_entities:
            logger.warning(f"Found entity {entity} which is not supported by Presidio")
        return entity