presidio committed on
Commit
b7be871
1 Parent(s): 3477655

Upload 3 files

Browse files
transformers_rec/configuration.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  STANFORD_COFIGURATION = {
2
  "DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
3
  "PRESIDIO_SUPPORTED_ENTITIES": [
@@ -11,7 +13,8 @@ STANFORD_COFIGURATION = {
11
  "DEVICE",
12
  "ZIP",
13
  "PROFESSION",
14
- "USERNAME"
 
15
 
16
  ],
17
  "LABELS_TO_IGNORE": ["O"],
@@ -22,8 +25,8 @@ STANFORD_COFIGURATION = {
22
  "DOCTOR": "PERSON",
23
  "PATIENT": "PERSON",
24
  "HOSPITAL": "LOCATION",
25
- "MEDICALRECORD": "O",
26
- "IDNUM": "O",
27
  "ORGANIZATION": "ORGANIZATION",
28
  "ZIP": "ZIP",
29
  "PHONE": "PHONE_NUMBER",
@@ -55,6 +58,8 @@ STANFORD_COFIGURATION = {
55
  },
56
  "CHUNK_OVERLAP_SIZE": 40,
57
  "CHUNK_SIZE": 600,
 
 
58
  }
59
 
60
 
@@ -70,6 +75,7 @@ BERT_DEID_CONFIGURATION = {
70
  "ZIP",
71
  "PROFESSION",
72
  "USERNAME",
 
73
  ],
74
  "DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
75
  "LABELS_TO_IGNORE": ["O"],
@@ -102,7 +108,7 @@ BERT_DEID_CONFIGURATION = {
102
  "LOC": "LOCATION",
103
  "ORG": "ORGANIZATION",
104
  "AGE": "AGE",
105
- "ID": "O",
106
  "EMAIL": "EMAIL",
107
  "PATIENT": "PERSON",
108
  "STAFF": "PERSON",
@@ -113,4 +119,6 @@ BERT_DEID_CONFIGURATION = {
113
  },
114
  "CHUNK_OVERLAP_SIZE": 40,
115
  "CHUNK_SIZE": 600,
 
 
116
  }
 
1
+ ## Taken from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/configuration.py
2
+
3
  STANFORD_COFIGURATION = {
4
  "DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
5
  "PRESIDIO_SUPPORTED_ENTITIES": [
 
13
  "DEVICE",
14
  "ZIP",
15
  "PROFESSION",
16
+ "USERNAME",
17
+ "ID"
18
 
19
  ],
20
  "LABELS_TO_IGNORE": ["O"],
 
25
  "DOCTOR": "PERSON",
26
  "PATIENT": "PERSON",
27
  "HOSPITAL": "LOCATION",
28
+ "MEDICALRECORD": "ID",
29
+ "IDNUM": "ID",
30
  "ORGANIZATION": "ORGANIZATION",
31
  "ZIP": "ZIP",
32
  "PHONE": "PHONE_NUMBER",
 
58
  },
59
  "CHUNK_OVERLAP_SIZE": 40,
60
  "CHUNK_SIZE": 600,
61
+ "ID_SCORE_MULTIPLIER": 0.4,
62
+ "ID_ENTITY_NAME": "ID"
63
  }
64
 
65
 
 
75
  "ZIP",
76
  "PROFESSION",
77
  "USERNAME",
78
+ "ID"
79
  ],
80
  "DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
81
  "LABELS_TO_IGNORE": ["O"],
 
108
  "LOC": "LOCATION",
109
  "ORG": "ORGANIZATION",
110
  "AGE": "AGE",
111
+ "ID": "ID",
112
  "EMAIL": "EMAIL",
113
  "PATIENT": "PERSON",
114
  "STAFF": "PERSON",
 
119
  },
120
  "CHUNK_OVERLAP_SIZE": 40,
121
  "CHUNK_SIZE": 600,
122
+ "ID_SCORE_MULTIPLIER": 0.4,
123
+ "ID_ENTITY_NAME": "ID"
124
  }
transformers_rec/transformers_recognizer.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import copy
2
  import logging
3
  from typing import Optional, List
@@ -90,6 +92,8 @@ class TransformersRecognizer(EntityRecognizer):
90
  self.default_explanation = None
91
  self.text_overlap_length = None
92
  self.chunk_length = None
 
 
93
 
94
  def load_transformer(self, **kwargs) -> None:
95
  """Load external configuration parameters and set default values.
@@ -104,6 +108,8 @@ class TransformersRecognizer(EntityRecognizer):
104
  **CHUNK_SIZE (int) - number of characters in each chunk of text
105
  **LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"]
106
  **DEFAULT_EXPLANATION (str) - string format to use for prediction explanations
 
 
107
  """
108
 
109
  self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
@@ -113,6 +119,9 @@ class TransformersRecognizer(EntityRecognizer):
113
  self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
114
  self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
115
  self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
 
 
 
116
  if not self.pipeline:
117
  if not self.model_path:
118
  self.model_path = "obi/deid_roberta_i2b2"
@@ -165,11 +174,14 @@ class TransformersRecognizer(EntityRecognizer):
165
  ner_results = self._get_ner_results_for_text(text)
166
 
167
  for res in ner_results:
168
- entity = self.model_to_presidio_mapping.get(res["entity_group"], None)
169
- if not entity:
170
  continue
171
 
172
- res["entity_group"] = self.__check_label_transformer(res["entity_group"])
 
 
 
173
  textual_explanation = self.default_explanation.format(res["entity_group"])
174
  explanation = self.build_transformers_explanation(
175
  float(round(res["score"], 2)), textual_explanation, res["word"]
@@ -224,33 +236,32 @@ class TransformersRecognizer(EntityRecognizer):
224
  model_max_length = self.pipeline.tokenizer.model_max_length
225
  # calculate inputs based on the text
226
  text_length = len(text)
227
- predictions = list()
228
- if text_length > model_max_length*2:
229
- # split text into chunks
 
230
  logger.info(
231
- f"splitting the text into chunks, length {text_length} > {model_max_length*2}"
232
  )
233
-
234
  chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
235
  text_length, self.chunk_length, self.text_overlap_length
236
- )
237
- else:
238
- chunk_indexes = [[0, text_length]]
239
 
240
- # iterate over text chunks and run inference
241
- for chunk_start, chunk_end in chunk_indexes:
242
- chunk_text = text[chunk_start:chunk_end]
243
- chunk_preds = self.pipeline(chunk_text)
244
 
245
- # align indexes to match the original text - add to each position the value of chunk_start
246
- aligned_predictions = list()
247
- for prediction in chunk_preds:
248
- prediction_tmp = copy.deepcopy(prediction)
249
- prediction_tmp["start"] += chunk_start
250
- prediction_tmp["end"] += chunk_start
251
- aligned_predictions.append(prediction_tmp)
252
 
253
- predictions.extend(aligned_predictions)
254
 
255
  # remove duplicates
256
  predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}]
@@ -302,27 +313,24 @@ class TransformersRecognizer(EntityRecognizer):
302
  )
303
  return explanation
304
 
305
- def __check_label_transformer(self, label: str) -> str:
306
  """The function validates the predicted label is identified by Presidio
307
  and maps the string into a Presidio representation
308
  :param label: Predicted label by the model
309
- :type label: str
310
- :return: Returns the predicted entity if the label is found in model_to_presidio mapping dictionary
311
- and is supported by Presidio entities
312
- :rtype: str
313
  """
314
 
315
- if label == "O":
316
- return label
317
-
318
  # convert model label to presidio label
319
  entity = self.model_to_presidio_mapping.get(label, None)
320
 
 
 
 
321
  if entity is None:
322
- logger.warning(f"Found unrecognized label {label}, returning entity as 'O'")
323
- return "O"
324
 
325
  if entity not in self.supported_entities:
326
  logger.warning(f"Found entity {entity} which is not supported by Presidio")
327
- return "O"
328
  return entity
 
1
+ # Modified from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/transformer_recognizer.py
2
+
3
  import copy
4
  import logging
5
  from typing import Optional, List
 
92
  self.default_explanation = None
93
  self.text_overlap_length = None
94
  self.chunk_length = None
95
+ self.id_entity_name = None
96
+ self.id_score_reduction = None
97
 
98
  def load_transformer(self, **kwargs) -> None:
99
  """Load external configuration parameters and set default values.
 
108
  **CHUNK_SIZE (int) - number of characters in each chunk of text
109
  **LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"]
110
  **DEFAULT_EXPLANATION (str) - string format to use for prediction explanations
111
+ **ID_ENTITY_NAME (str) - name of the ID entity
112
+ **ID_SCORE_REDUCTION (float) - score multiplier for ID entities
113
  """
114
 
115
  self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
 
119
  self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
120
  self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
121
  self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
122
+ self.id_entity_name = kwargs.get("ID_ENTITY_NAME", "ID")
123
+ self.id_score_reduction = kwargs.get("ID_SCORE_REDUCTION", 0.5)
124
+
125
  if not self.pipeline:
126
  if not self.model_path:
127
  self.model_path = "obi/deid_roberta_i2b2"
 
174
  ner_results = self._get_ner_results_for_text(text)
175
 
176
  for res in ner_results:
177
+ res["entity_group"] = self.__check_label_transformer(res["entity_group"])
178
+ if not res["entity_group"]:
179
  continue
180
 
181
+ if res["entity_group"] == self.id_entity_name:
182
+ print(f"ID entity found, multiplying score by {self.id_score_reduction}")
183
+ res["score"] = res["score"] * self.id_score_reduction
184
+
185
  textual_explanation = self.default_explanation.format(res["entity_group"])
186
  explanation = self.build_transformers_explanation(
187
  float(round(res["score"], 2)), textual_explanation, res["word"]
 
236
  model_max_length = self.pipeline.tokenizer.model_max_length
237
  # calculate inputs based on the text
238
  text_length = len(text)
239
+ # split text into chunks
240
+ if text_length <= model_max_length:
241
+ predictions = self.pipeline(text)
242
+ else:
243
  logger.info(
244
+ f"splitting the text into chunks, length {text_length} > {model_max_length}"
245
  )
246
+ predictions = list()
247
  chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
248
  text_length, self.chunk_length, self.text_overlap_length
249
+ )
 
 
250
 
251
+ # iterate over text chunks and run inference
252
+ for chunk_start, chunk_end in chunk_indexes:
253
+ chunk_text = text[chunk_start:chunk_end]
254
+ chunk_preds = self.pipeline(chunk_text)
255
 
256
+ # align indexes to match the original text - add to each position the value of chunk_start
257
+ aligned_predictions = list()
258
+ for prediction in chunk_preds:
259
+ prediction_tmp = copy.deepcopy(prediction)
260
+ prediction_tmp["start"] += chunk_start
261
+ prediction_tmp["end"] += chunk_start
262
+ aligned_predictions.append(prediction_tmp)
263
 
264
+ predictions.extend(aligned_predictions)
265
 
266
  # remove duplicates
267
  predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}]
 
313
  )
314
  return explanation
315
 
316
+ def __check_label_transformer(self, label: str) -> Optional[str]:
317
  """The function validates the predicted label is identified by Presidio
318
  and maps the string into a Presidio representation
319
  :param label: Predicted label by the model
320
+ :return: Returns the adjusted entity name
 
 
 
321
  """
322
 
 
 
 
323
  # convert model label to presidio label
324
  entity = self.model_to_presidio_mapping.get(label, None)
325
 
326
+ if entity in self.ignore_labels:
327
+ return None
328
+
329
  if entity is None:
330
+ logger.warning(f"Found unrecognized label {label}, returning entity as is")
331
+ return label
332
 
333
  if entity not in self.supported_entities:
334
  logger.warning(f"Found entity {entity} which is not supported by Presidio")
335
+ return entity
336
  return entity