petrsovadina committed on
Commit
9224c4e
1 Parent(s): 2cac169

Update presidio_nlp_engine_config.py

Browse files
Files changed (1) hide show
  1. presidio_nlp_engine_config.py +41 -61
presidio_nlp_engine_config.py CHANGED
@@ -10,6 +10,12 @@ from presidio_analyzer.nlp_engine import (
10
 
11
  logger = logging.getLogger("presidio-streamlit")
12
 
 
 
 
 
 
 
13
 
14
  def create_nlp_engine_with_spacy(
15
  model_path: str,
@@ -20,23 +26,20 @@ def create_nlp_engine_with_spacy(
20
  """
21
  nlp_configuration = {
22
  "nlp_engine_name": "spacy",
23
- "models": [{"lang_code": "en", "model_name": model_path}],
24
  "ner_model_configuration": {
25
  "model_to_presidio_entity_mapping": {
26
- "PER": "PERSON",
27
- "PERSON": "PERSON",
28
- "NORP": "NRP",
29
- "FAC": "FACILITY",
30
- "LOC": "LOCATION",
31
- "GPE": "LOCATION",
32
- "LOCATION": "LOCATION",
33
- "ORG": "ORGANIZATION",
34
- "ORGANIZATION": "ORGANIZATION",
35
- "DATE": "DATE_TIME",
36
- "TIME": "DATE_TIME",
37
  },
38
  "low_confidence_score_multiplier": 0.4,
39
- "low_score_entity_names": ["ORG", "ORGANIZATION"],
40
  },
41
  }
42
 
@@ -47,7 +50,6 @@ def create_nlp_engine_with_spacy(
47
 
48
  return nlp_engine, registry
49
 
50
-
51
  def create_nlp_engine_with_stanza(
52
  model_path: str,
53
  ) -> Tuple[NlpEngine, RecognizerRegistry]:
@@ -57,20 +59,15 @@ def create_nlp_engine_with_stanza(
57
  """
58
  nlp_configuration = {
59
  "nlp_engine_name": "stanza",
60
- "models": [{"lang_code": "en", "model_name": model_path}],
61
  "ner_model_configuration": {
62
  "model_to_presidio_entity_mapping": {
63
- "PER": "PERSON",
64
- "PERSON": "PERSON",
65
- "NORP": "NRP",
66
- "FAC": "FACILITY",
67
- "LOC": "LOCATION",
68
- "GPE": "LOCATION",
69
- "LOCATION": "LOCATION",
70
- "ORG": "ORGANIZATION",
71
- "ORGANIZATION": "ORGANIZATION",
72
- "DATE": "DATE_TIME",
73
- "TIME": "DATE_TIME",
74
  }
75
  },
76
  }
@@ -82,7 +79,6 @@ def create_nlp_engine_with_stanza(
82
 
83
  return nlp_engine, registry
84
 
85
-
86
  def create_nlp_engine_with_transformers(
87
  model_path: str,
88
  ) -> Tuple[NlpEngine, RecognizerRegistry]:
@@ -92,44 +88,32 @@ def create_nlp_engine_with_transformers(
92
  would return NlpArtifacts such as POS and lemmas.
93
  :param model_path: HuggingFace model path.
94
  """
95
- print(f"Loading Transformers model: {model_path} of type {type(model_path)}")
96
 
97
  nlp_configuration = {
98
  "nlp_engine_name": "transformers",
99
  "models": [
100
  {
101
- "lang_code": "en",
102
- "model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
103
  }
104
  ],
105
  "ner_model_configuration": {
106
  "model_to_presidio_entity_mapping": {
107
- "PER": "PERSON",
108
- "PERSON": "PERSON",
109
- "LOC": "LOCATION",
110
- "LOCATION": "LOCATION",
111
- "GPE": "LOCATION",
112
- "ORG": "ORGANIZATION",
113
- "ORGANIZATION": "ORGANIZATION",
114
- "NORP": "NRP",
115
- "AGE": "AGE",
116
- "ID": "ID",
117
  "EMAIL": "EMAIL",
118
- "PATIENT": "PERSON",
119
- "STAFF": "PERSON",
120
- "HOSP": "ORGANIZATION",
121
- "PATORG": "ORGANIZATION",
122
- "DATE": "DATE_TIME",
123
- "TIME": "DATE_TIME",
124
- "PHONE": "PHONE_NUMBER",
125
- "HCW": "PERSON",
126
- "HOSPITAL": "ORGANIZATION",
127
- "FACILITY": "LOCATION",
128
  },
129
  "low_confidence_score_multiplier": 0.4,
130
  "low_score_entity_names": ["ID"],
131
  "labels_to_ignore": [
132
- "CARDINAL",
133
  "EVENT",
134
  "LANGUAGE",
135
  "LAW",
@@ -150,7 +134,6 @@ def create_nlp_engine_with_transformers(
150
 
151
  return nlp_engine, registry
152
 
153
-
154
  def create_nlp_engine_with_flair(
155
  model_path: str,
156
  ) -> Tuple[NlpEngine, RecognizerRegistry]:
@@ -165,15 +148,13 @@ def create_nlp_engine_with_flair(
165
  registry = RecognizerRegistry()
166
  registry.load_predefined_recognizers()
167
 
168
- # there is no official Flair NlpEngine, hence we load it as an additional recognizer
169
-
170
- if not spacy.util.is_package("en_core_web_sm"):
171
- spacy.cli.download("en_core_web_sm")
172
- # Using a small spaCy model + a Flair NER model
173
  flair_recognizer = FlairRecognizer(model_path=model_path)
174
  nlp_configuration = {
175
  "nlp_engine_name": "spacy",
176
- "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
177
  }
178
  registry.add_recognizer(flair_recognizer)
179
  registry.remove_recognizer("SpacyRecognizer")
@@ -182,7 +163,6 @@ def create_nlp_engine_with_flair(
182
 
183
  return nlp_engine, registry
184
 
185
-
186
  def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
187
  """
188
  Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
@@ -204,7 +184,7 @@ def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
204
  )
205
  nlp_configuration = {
206
  "nlp_engine_name": "spacy",
207
- "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
208
  }
209
 
210
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
@@ -212,4 +192,4 @@ def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
212
  registry.add_recognizer(azure_ai_language_recognizer)
213
  registry.remove_recognizer("SpacyRecognizer")
214
 
215
- return nlp_engine, registry
 
10
 
11
  logger = logging.getLogger("presidio-streamlit")
12
 
13
+ CZECH_ENTITY_MAPPING = {
14
+ "PER": "OSOBA",
15
+ "LOC": "ADRESA",
16
+ "ORG": "ORGANIZACE",
17
+ "MISC": "RŮZNÉ",
18
+ }
19
 
20
  def create_nlp_engine_with_spacy(
21
  model_path: str,
 
26
  """
27
  nlp_configuration = {
28
  "nlp_engine_name": "spacy",
29
+ "models": [{"lang_code": "cs", "model_name": model_path}],
30
  "ner_model_configuration": {
31
  "model_to_presidio_entity_mapping": {
32
+ **CZECH_ENTITY_MAPPING,
33
+ "PERSON": "OSOBA",
34
+ "GPE": "ADRESA",
35
+ "LOCATION": "ADRESA",
36
+ "ORGANIZATION": "ORGANIZACE",
37
+ "DATE": "DATUM_NAROZENÍ",
38
+ "CARDINAL": "RODNÉ_ČÍSLO",
39
+ "ORG": "IČO",
 
 
 
40
  },
41
  "low_confidence_score_multiplier": 0.4,
42
+ "low_score_entity_names": ["ORG", "ORGANIZACE"],
43
  },
44
  }
45
 
 
50
 
51
  return nlp_engine, registry
52
 
 
53
  def create_nlp_engine_with_stanza(
54
  model_path: str,
55
  ) -> Tuple[NlpEngine, RecognizerRegistry]:
 
59
  """
60
  nlp_configuration = {
61
  "nlp_engine_name": "stanza",
62
+ "models": [{"lang_code": "cs", "model_name": model_path}],
63
  "ner_model_configuration": {
64
  "model_to_presidio_entity_mapping": {
65
+ **CZECH_ENTITY_MAPPING,
66
+ "PERSON": "OSOBA",
67
+ "GPE": "ADRESA",
68
+ "LOCATION": "ADRESA",
69
+ "ORGANIZATION": "ORGANIZACE",
70
+ "DATE": "DATUM_NAROZENÍ",
 
 
 
 
 
71
  }
72
  },
73
  }
 
79
 
80
  return nlp_engine, registry
81
 
 
82
  def create_nlp_engine_with_transformers(
83
  model_path: str,
84
  ) -> Tuple[NlpEngine, RecognizerRegistry]:
 
88
  would return NlpArtifacts such as POS and lemmas.
89
  :param model_path: HuggingFace model path.
90
  """
91
+ logger.info(f"Loading Transformers model: {model_path}")
92
 
93
  nlp_configuration = {
94
  "nlp_engine_name": "transformers",
95
  "models": [
96
  {
97
+ "lang_code": "cs",
98
+ "model_name": {"spacy": "cs_core_news_sm", "transformers": model_path},
99
  }
100
  ],
101
  "ner_model_configuration": {
102
  "model_to_presidio_entity_mapping": {
103
+ **CZECH_ENTITY_MAPPING,
104
+ "PERSON": "OSOBA",
105
+ "LOC": "ADRESA",
106
+ "GPE": "ADRESA",
107
+ "ORG": "ORGANIZACE",
108
+ "DATE": "DATUM_NAROZENÍ",
109
+ "CARDINAL": "RODNÉ_ČÍSLO",
110
+ "ID": "IČO",
 
 
111
  "EMAIL": "EMAIL",
112
+ "PHONE": "TELEFON",
 
 
 
 
 
 
 
 
 
113
  },
114
  "low_confidence_score_multiplier": 0.4,
115
  "low_score_entity_names": ["ID"],
116
  "labels_to_ignore": [
 
117
  "EVENT",
118
  "LANGUAGE",
119
  "LAW",
 
134
 
135
  return nlp_engine, registry
136
 
 
137
  def create_nlp_engine_with_flair(
138
  model_path: str,
139
  ) -> Tuple[NlpEngine, RecognizerRegistry]:
 
148
  registry = RecognizerRegistry()
149
  registry.load_predefined_recognizers()
150
 
151
+ if not spacy.util.is_package("cs_core_news_sm"):
152
+ spacy.cli.download("cs_core_news_sm")
153
+
 
 
154
  flair_recognizer = FlairRecognizer(model_path=model_path)
155
  nlp_configuration = {
156
  "nlp_engine_name": "spacy",
157
+ "models": [{"lang_code": "cs", "model_name": "cs_core_news_sm"}],
158
  }
159
  registry.add_recognizer(flair_recognizer)
160
  registry.remove_recognizer("SpacyRecognizer")
 
163
 
164
  return nlp_engine, registry
165
 
 
166
  def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
167
  """
168
  Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
 
184
  )
185
  nlp_configuration = {
186
  "nlp_engine_name": "spacy",
187
+ "models": [{"lang_code": "cs", "model_name": "cs_core_news_sm"}],
188
  }
189
 
190
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
 
192
  registry.add_recognizer(azure_ai_language_recognizer)
193
  registry.remove_recognizer("SpacyRecognizer")
194
 
195
+ return nlp_engine, registry