Spaces:
Running
Running
petrsovadina
committed on
Commit
•
e1413ef
1
Parent(s):
d5530be
Update presidio_nlp_engine_config.py
Browse files - presidio_nlp_engine_config.py (+38 -38)
presidio_nlp_engine_config.py
CHANGED
@@ -10,11 +10,11 @@ from presidio_analyzer.nlp_engine import (
|
|
10 |
|
11 |
logger = logging.getLogger("presidio-streamlit")
|
12 |
|
13 |
-
|
14 |
-
"PER": "
|
15 |
-
"LOC": "
|
16 |
-
"ORG": "
|
17 |
-
"MISC": "
|
18 |
}
|
19 |
|
20 |
def create_nlp_engine_with_spacy(
|
@@ -26,20 +26,20 @@ def create_nlp_engine_with_spacy(
|
|
26 |
"""
|
27 |
nlp_configuration = {
|
28 |
"nlp_engine_name": "spacy",
|
29 |
-
"models": [{"lang_code": "
|
30 |
"ner_model_configuration": {
|
31 |
"model_to_presidio_entity_mapping": {
|
32 |
-
**
|
33 |
-
"PERSON": "
|
34 |
-
"GPE": "
|
35 |
-
"LOCATION": "
|
36 |
-
"ORGANIZATION": "
|
37 |
-
"DATE": "
|
38 |
-
"CARDINAL": "
|
39 |
-
"ORG": "
|
40 |
},
|
41 |
"low_confidence_score_multiplier": 0.4,
|
42 |
-
"low_score_entity_names": ["ORG", "
|
43 |
},
|
44 |
}
|
45 |
|
@@ -59,15 +59,15 @@ def create_nlp_engine_with_stanza(
|
|
59 |
"""
|
60 |
nlp_configuration = {
|
61 |
"nlp_engine_name": "stanza",
|
62 |
-
"models": [{"lang_code": "
|
63 |
"ner_model_configuration": {
|
64 |
"model_to_presidio_entity_mapping": {
|
65 |
-
**
|
66 |
-
"PERSON": "
|
67 |
-
"GPE": "
|
68 |
-
"LOCATION": "
|
69 |
-
"ORGANIZATION": "
|
70 |
-
"DATE": "
|
71 |
}
|
72 |
},
|
73 |
}
|
@@ -94,22 +94,22 @@ def create_nlp_engine_with_transformers(
|
|
94 |
"nlp_engine_name": "transformers",
|
95 |
"models": [
|
96 |
{
|
97 |
-
"lang_code": "
|
98 |
-
"model_name": {"spacy": "
|
99 |
}
|
100 |
],
|
101 |
"ner_model_configuration": {
|
102 |
"model_to_presidio_entity_mapping": {
|
103 |
-
**
|
104 |
-
"PERSON": "
|
105 |
-
"LOC": "
|
106 |
-
"GPE": "
|
107 |
-
"ORG": "
|
108 |
-
"DATE": "
|
109 |
-
"CARDINAL": "
|
110 |
-
"ID": "
|
111 |
"EMAIL": "EMAIL",
|
112 |
-
"PHONE": "
|
113 |
},
|
114 |
"low_confidence_score_multiplier": 0.4,
|
115 |
"low_score_entity_names": ["ID"],
|
@@ -148,13 +148,13 @@ def create_nlp_engine_with_flair(
|
|
148 |
registry = RecognizerRegistry()
|
149 |
registry.load_predefined_recognizers()
|
150 |
|
151 |
-
if not spacy.util.is_package("
|
152 |
-
spacy.cli.download("
|
153 |
|
154 |
flair_recognizer = FlairRecognizer(model_path=model_path)
|
155 |
nlp_configuration = {
|
156 |
"nlp_engine_name": "spacy",
|
157 |
-
"models": [{"lang_code": "
|
158 |
}
|
159 |
registry.add_recognizer(flair_recognizer)
|
160 |
registry.remove_recognizer("SpacyRecognizer")
|
@@ -184,7 +184,7 @@ def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
|
|
184 |
)
|
185 |
nlp_configuration = {
|
186 |
"nlp_engine_name": "spacy",
|
187 |
-
"models": [{"lang_code": "
|
188 |
}
|
189 |
|
190 |
nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
|
@@ -192,4 +192,4 @@ def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
|
|
192 |
registry.add_recognizer(azure_ai_language_recognizer)
|
193 |
registry.remove_recognizer("SpacyRecognizer")
|
194 |
|
195 |
-
return nlp_engine, registry
|
|
|
10 |
|
11 |
logger = logging.getLogger("presidio-streamlit")
|
12 |
|
13 |
+
ENGLISH_ENTITY_MAPPING = {
|
14 |
+
"PER": "PERSON",
|
15 |
+
"LOC": "ADDRESS",
|
16 |
+
"ORG": "ORGANIZATION",
|
17 |
+
"MISC": "MISC",
|
18 |
}
|
19 |
|
20 |
def create_nlp_engine_with_spacy(
|
|
|
26 |
"""
|
27 |
nlp_configuration = {
|
28 |
"nlp_engine_name": "spacy",
|
29 |
+
"models": [{"lang_code": "en", "model_name": model_path}],
|
30 |
"ner_model_configuration": {
|
31 |
"model_to_presidio_entity_mapping": {
|
32 |
+
**ENGLISH_ENTITY_MAPPING,
|
33 |
+
"PERSON": "PERSON",
|
34 |
+
"GPE": "ADDRESS",
|
35 |
+
"LOCATION": "ADDRESS",
|
36 |
+
"ORGANIZATION": "ORGANIZATION",
|
37 |
+
"DATE": "DATE_OF_BIRTH",
|
38 |
+
"CARDINAL": "NATIONAL_ID_NUMBER",
|
39 |
+
"ORG": "COMPANY_ID",
|
40 |
},
|
41 |
"low_confidence_score_multiplier": 0.4,
|
42 |
+
"low_score_entity_names": ["ORG", "ORGANIZATION"],
|
43 |
},
|
44 |
}
|
45 |
|
|
|
59 |
"""
|
60 |
nlp_configuration = {
|
61 |
"nlp_engine_name": "stanza",
|
62 |
+
"models": [{"lang_code": "en", "model_name": model_path}],
|
63 |
"ner_model_configuration": {
|
64 |
"model_to_presidio_entity_mapping": {
|
65 |
+
**ENGLISH_ENTITY_MAPPING,
|
66 |
+
"PERSON": "PERSON",
|
67 |
+
"GPE": "ADDRESS",
|
68 |
+
"LOCATION": "ADDRESS",
|
69 |
+
"ORGANIZATION": "ORGANIZATION",
|
70 |
+
"DATE": "DATE_OF_BIRTH",
|
71 |
}
|
72 |
},
|
73 |
}
|
|
|
94 |
"nlp_engine_name": "transformers",
|
95 |
"models": [
|
96 |
{
|
97 |
+
"lang_code": "en",
|
98 |
+
"model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
|
99 |
}
|
100 |
],
|
101 |
"ner_model_configuration": {
|
102 |
"model_to_presidio_entity_mapping": {
|
103 |
+
**ENGLISH_ENTITY_MAPPING,
|
104 |
+
"PERSON": "PERSON",
|
105 |
+
"LOC": "ADDRESS",
|
106 |
+
"GPE": "ADDRESS",
|
107 |
+
"ORG": "ORGANIZATION",
|
108 |
+
"DATE": "DATE_OF_BIRTH",
|
109 |
+
"CARDINAL": "NATIONAL_ID_NUMBER",
|
110 |
+
"ID": "COMPANY_ID",
|
111 |
"EMAIL": "EMAIL",
|
112 |
+
"PHONE": "PHONE",
|
113 |
},
|
114 |
"low_confidence_score_multiplier": 0.4,
|
115 |
"low_score_entity_names": ["ID"],
|
|
|
148 |
registry = RecognizerRegistry()
|
149 |
registry.load_predefined_recognizers()
|
150 |
|
151 |
+
if not spacy.util.is_package("en_core_web_sm"):
|
152 |
+
spacy.cli.download("en_core_web_sm")
|
153 |
|
154 |
flair_recognizer = FlairRecognizer(model_path=model_path)
|
155 |
nlp_configuration = {
|
156 |
"nlp_engine_name": "spacy",
|
157 |
+
"models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
|
158 |
}
|
159 |
registry.add_recognizer(flair_recognizer)
|
160 |
registry.remove_recognizer("SpacyRecognizer")
|
|
|
184 |
)
|
185 |
nlp_configuration = {
|
186 |
"nlp_engine_name": "spacy",
|
187 |
+
"models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
|
188 |
}
|
189 |
|
190 |
nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
|
|
|
192 |
registry.add_recognizer(azure_ai_language_recognizer)
|
193 |
registry.remove_recognizer("SpacyRecognizer")
|
194 |
|
195 |
+
return nlp_engine, registry
|