petrsovadina committed on
Commit
e1413ef
1 Parent(s): d5530be

Update presidio_nlp_engine_config.py

Browse files
Files changed (1) hide show
  1. presidio_nlp_engine_config.py +38 -38
presidio_nlp_engine_config.py CHANGED
@@ -10,11 +10,11 @@ from presidio_analyzer.nlp_engine import (
10
 
11
  logger = logging.getLogger("presidio-streamlit")
12
 
13
- CZECH_ENTITY_MAPPING = {
14
- "PER": "OSOBA",
15
- "LOC": "ADRESA",
16
- "ORG": "ORGANIZACE",
17
- "MISC": "RŮZNÉ",
18
  }
19
 
20
  def create_nlp_engine_with_spacy(
@@ -26,20 +26,20 @@ def create_nlp_engine_with_spacy(
26
  """
27
  nlp_configuration = {
28
  "nlp_engine_name": "spacy",
29
- "models": [{"lang_code": "cs", "model_name": model_path}],
30
  "ner_model_configuration": {
31
  "model_to_presidio_entity_mapping": {
32
- **CZECH_ENTITY_MAPPING,
33
- "PERSON": "OSOBA",
34
- "GPE": "ADRESA",
35
- "LOCATION": "ADRESA",
36
- "ORGANIZATION": "ORGANIZACE",
37
- "DATE": "DATUM_NAROZENÍ",
38
- "CARDINAL": "RODNÉ_ČÍSLO",
39
- "ORG": "IČO",
40
  },
41
  "low_confidence_score_multiplier": 0.4,
42
- "low_score_entity_names": ["ORG", "ORGANIZACE"],
43
  },
44
  }
45
 
@@ -59,15 +59,15 @@ def create_nlp_engine_with_stanza(
59
  """
60
  nlp_configuration = {
61
  "nlp_engine_name": "stanza",
62
- "models": [{"lang_code": "cs", "model_name": model_path}],
63
  "ner_model_configuration": {
64
  "model_to_presidio_entity_mapping": {
65
- **CZECH_ENTITY_MAPPING,
66
- "PERSON": "OSOBA",
67
- "GPE": "ADRESA",
68
- "LOCATION": "ADRESA",
69
- "ORGANIZATION": "ORGANIZACE",
70
- "DATE": "DATUM_NAROZENÍ",
71
  }
72
  },
73
  }
@@ -94,22 +94,22 @@ def create_nlp_engine_with_transformers(
94
  "nlp_engine_name": "transformers",
95
  "models": [
96
  {
97
- "lang_code": "cs",
98
- "model_name": {"spacy": "cs_core_news_sm", "transformers": model_path},
99
  }
100
  ],
101
  "ner_model_configuration": {
102
  "model_to_presidio_entity_mapping": {
103
- **CZECH_ENTITY_MAPPING,
104
- "PERSON": "OSOBA",
105
- "LOC": "ADRESA",
106
- "GPE": "ADRESA",
107
- "ORG": "ORGANIZACE",
108
- "DATE": "DATUM_NAROZENÍ",
109
- "CARDINAL": "RODNÉ_ČÍSLO",
110
- "ID": "IČO",
111
  "EMAIL": "EMAIL",
112
- "PHONE": "TELEFON",
113
  },
114
  "low_confidence_score_multiplier": 0.4,
115
  "low_score_entity_names": ["ID"],
@@ -148,13 +148,13 @@ def create_nlp_engine_with_flair(
148
  registry = RecognizerRegistry()
149
  registry.load_predefined_recognizers()
150
 
151
- if not spacy.util.is_package("cs_core_news_sm"):
152
- spacy.cli.download("cs_core_news_sm")
153
 
154
  flair_recognizer = FlairRecognizer(model_path=model_path)
155
  nlp_configuration = {
156
  "nlp_engine_name": "spacy",
157
- "models": [{"lang_code": "cs", "model_name": "cs_core_news_sm"}],
158
  }
159
  registry.add_recognizer(flair_recognizer)
160
  registry.remove_recognizer("SpacyRecognizer")
@@ -184,7 +184,7 @@ def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
184
  )
185
  nlp_configuration = {
186
  "nlp_engine_name": "spacy",
187
- "models": [{"lang_code": "cs", "model_name": "cs_core_news_sm"}],
188
  }
189
 
190
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
@@ -192,4 +192,4 @@ def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
192
  registry.add_recognizer(azure_ai_language_recognizer)
193
  registry.remove_recognizer("SpacyRecognizer")
194
 
195
- return nlp_engine, registry
 
10
 
11
  logger = logging.getLogger("presidio-streamlit")
12
 
13
+ ENGLISH_ENTITY_MAPPING = {
14
+ "PER": "PERSON",
15
+ "LOC": "ADDRESS",
16
+ "ORG": "ORGANIZATION",
17
+ "MISC": "MISC",
18
  }
19
 
20
  def create_nlp_engine_with_spacy(
 
26
  """
27
  nlp_configuration = {
28
  "nlp_engine_name": "spacy",
29
+ "models": [{"lang_code": "en", "model_name": model_path}],
30
  "ner_model_configuration": {
31
  "model_to_presidio_entity_mapping": {
32
+ **ENGLISH_ENTITY_MAPPING,
33
+ "PERSON": "PERSON",
34
+ "GPE": "ADDRESS",
35
+ "LOCATION": "ADDRESS",
36
+ "ORGANIZATION": "ORGANIZATION",
37
+ "DATE": "DATE_OF_BIRTH",
38
+ "CARDINAL": "NATIONAL_ID_NUMBER",
39
+ "ORG": "COMPANY_ID",
40
  },
41
  "low_confidence_score_multiplier": 0.4,
42
+ "low_score_entity_names": ["ORG", "ORGANIZATION"],
43
  },
44
  }
45
 
 
59
  """
60
  nlp_configuration = {
61
  "nlp_engine_name": "stanza",
62
+ "models": [{"lang_code": "en", "model_name": model_path}],
63
  "ner_model_configuration": {
64
  "model_to_presidio_entity_mapping": {
65
+ **ENGLISH_ENTITY_MAPPING,
66
+ "PERSON": "PERSON",
67
+ "GPE": "ADDRESS",
68
+ "LOCATION": "ADDRESS",
69
+ "ORGANIZATION": "ORGANIZATION",
70
+ "DATE": "DATE_OF_BIRTH",
71
  }
72
  },
73
  }
 
94
  "nlp_engine_name": "transformers",
95
  "models": [
96
  {
97
+ "lang_code": "en",
98
+ "model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
99
  }
100
  ],
101
  "ner_model_configuration": {
102
  "model_to_presidio_entity_mapping": {
103
+ **ENGLISH_ENTITY_MAPPING,
104
+ "PERSON": "PERSON",
105
+ "LOC": "ADDRESS",
106
+ "GPE": "ADDRESS",
107
+ "ORG": "ORGANIZATION",
108
+ "DATE": "DATE_OF_BIRTH",
109
+ "CARDINAL": "NATIONAL_ID_NUMBER",
110
+ "ID": "COMPANY_ID",
111
  "EMAIL": "EMAIL",
112
+ "PHONE": "PHONE",
113
  },
114
  "low_confidence_score_multiplier": 0.4,
115
  "low_score_entity_names": ["ID"],
 
148
  registry = RecognizerRegistry()
149
  registry.load_predefined_recognizers()
150
 
151
+ if not spacy.util.is_package("en_core_web_sm"):
152
+ spacy.cli.download("en_core_web_sm")
153
 
154
  flair_recognizer = FlairRecognizer(model_path=model_path)
155
  nlp_configuration = {
156
  "nlp_engine_name": "spacy",
157
+ "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
158
  }
159
  registry.add_recognizer(flair_recognizer)
160
  registry.remove_recognizer("SpacyRecognizer")
 
184
  )
185
  nlp_configuration = {
186
  "nlp_engine_name": "spacy",
187
+ "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
188
  }
189
 
190
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
 
192
  registry.add_recognizer(azure_ai_language_recognizer)
193
  registry.remove_recognizer("SpacyRecognizer")
194
 
195
+ return nlp_engine, registry