Spaces:
Running
Running
petrsovadina
committed on
Commit
•
9224c4e
1
Parent(s):
2cac169
Update presidio_nlp_engine_config.py
Browse files- presidio_nlp_engine_config.py +41 -61
presidio_nlp_engine_config.py
CHANGED
@@ -10,6 +10,12 @@ from presidio_analyzer.nlp_engine import (
|
|
10 |
|
11 |
logger = logging.getLogger("presidio-streamlit")
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
def create_nlp_engine_with_spacy(
|
15 |
model_path: str,
|
@@ -20,23 +26,20 @@ def create_nlp_engine_with_spacy(
|
|
20 |
"""
|
21 |
nlp_configuration = {
|
22 |
"nlp_engine_name": "spacy",
|
23 |
-
"models": [{"lang_code": "
|
24 |
"ner_model_configuration": {
|
25 |
"model_to_presidio_entity_mapping": {
|
26 |
-
|
27 |
-
"PERSON": "
|
28 |
-
"
|
29 |
-
"
|
30 |
-
"
|
31 |
-
"
|
32 |
-
"
|
33 |
-
"ORG": "
|
34 |
-
"ORGANIZATION": "ORGANIZATION",
|
35 |
-
"DATE": "DATE_TIME",
|
36 |
-
"TIME": "DATE_TIME",
|
37 |
},
|
38 |
"low_confidence_score_multiplier": 0.4,
|
39 |
-
"low_score_entity_names": ["ORG", "
|
40 |
},
|
41 |
}
|
42 |
|
@@ -47,7 +50,6 @@ def create_nlp_engine_with_spacy(
|
|
47 |
|
48 |
return nlp_engine, registry
|
49 |
|
50 |
-
|
51 |
def create_nlp_engine_with_stanza(
|
52 |
model_path: str,
|
53 |
) -> Tuple[NlpEngine, RecognizerRegistry]:
|
@@ -57,20 +59,15 @@ def create_nlp_engine_with_stanza(
|
|
57 |
"""
|
58 |
nlp_configuration = {
|
59 |
"nlp_engine_name": "stanza",
|
60 |
-
"models": [{"lang_code": "
|
61 |
"ner_model_configuration": {
|
62 |
"model_to_presidio_entity_mapping": {
|
63 |
-
|
64 |
-
"PERSON": "
|
65 |
-
"
|
66 |
-
"
|
67 |
-
"
|
68 |
-
"
|
69 |
-
"LOCATION": "LOCATION",
|
70 |
-
"ORG": "ORGANIZATION",
|
71 |
-
"ORGANIZATION": "ORGANIZATION",
|
72 |
-
"DATE": "DATE_TIME",
|
73 |
-
"TIME": "DATE_TIME",
|
74 |
}
|
75 |
},
|
76 |
}
|
@@ -82,7 +79,6 @@ def create_nlp_engine_with_stanza(
|
|
82 |
|
83 |
return nlp_engine, registry
|
84 |
|
85 |
-
|
86 |
def create_nlp_engine_with_transformers(
|
87 |
model_path: str,
|
88 |
) -> Tuple[NlpEngine, RecognizerRegistry]:
|
@@ -92,44 +88,32 @@ def create_nlp_engine_with_transformers(
|
|
92 |
would return NlpArtifacts such as POS and lemmas.
|
93 |
:param model_path: HuggingFace model path.
|
94 |
"""
|
95 |
-
|
96 |
|
97 |
nlp_configuration = {
|
98 |
"nlp_engine_name": "transformers",
|
99 |
"models": [
|
100 |
{
|
101 |
-
"lang_code": "
|
102 |
-
"model_name": {"spacy": "
|
103 |
}
|
104 |
],
|
105 |
"ner_model_configuration": {
|
106 |
"model_to_presidio_entity_mapping": {
|
107 |
-
|
108 |
-
"PERSON": "
|
109 |
-
"LOC": "
|
110 |
-
"
|
111 |
-
"
|
112 |
-
"
|
113 |
-
"
|
114 |
-
"
|
115 |
-
"AGE": "AGE",
|
116 |
-
"ID": "ID",
|
117 |
"EMAIL": "EMAIL",
|
118 |
-
"
|
119 |
-
"STAFF": "PERSON",
|
120 |
-
"HOSP": "ORGANIZATION",
|
121 |
-
"PATORG": "ORGANIZATION",
|
122 |
-
"DATE": "DATE_TIME",
|
123 |
-
"TIME": "DATE_TIME",
|
124 |
-
"PHONE": "PHONE_NUMBER",
|
125 |
-
"HCW": "PERSON",
|
126 |
-
"HOSPITAL": "ORGANIZATION",
|
127 |
-
"FACILITY": "LOCATION",
|
128 |
},
|
129 |
"low_confidence_score_multiplier": 0.4,
|
130 |
"low_score_entity_names": ["ID"],
|
131 |
"labels_to_ignore": [
|
132 |
-
"CARDINAL",
|
133 |
"EVENT",
|
134 |
"LANGUAGE",
|
135 |
"LAW",
|
@@ -150,7 +134,6 @@ def create_nlp_engine_with_transformers(
|
|
150 |
|
151 |
return nlp_engine, registry
|
152 |
|
153 |
-
|
154 |
def create_nlp_engine_with_flair(
|
155 |
model_path: str,
|
156 |
) -> Tuple[NlpEngine, RecognizerRegistry]:
|
@@ -165,15 +148,13 @@ def create_nlp_engine_with_flair(
|
|
165 |
registry = RecognizerRegistry()
|
166 |
registry.load_predefined_recognizers()
|
167 |
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
spacy.cli.download("en_core_web_sm")
|
172 |
-
# Using a small spaCy model + a Flair NER model
|
173 |
flair_recognizer = FlairRecognizer(model_path=model_path)
|
174 |
nlp_configuration = {
|
175 |
"nlp_engine_name": "spacy",
|
176 |
-
"models": [{"lang_code": "
|
177 |
}
|
178 |
registry.add_recognizer(flair_recognizer)
|
179 |
registry.remove_recognizer("SpacyRecognizer")
|
@@ -182,7 +163,6 @@ def create_nlp_engine_with_flair(
|
|
182 |
|
183 |
return nlp_engine, registry
|
184 |
|
185 |
-
|
186 |
def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
|
187 |
"""
|
188 |
Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
|
@@ -204,7 +184,7 @@ def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
|
|
204 |
)
|
205 |
nlp_configuration = {
|
206 |
"nlp_engine_name": "spacy",
|
207 |
-
"models": [{"lang_code": "
|
208 |
}
|
209 |
|
210 |
nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
|
@@ -212,4 +192,4 @@ def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
|
|
212 |
registry.add_recognizer(azure_ai_language_recognizer)
|
213 |
registry.remove_recognizer("SpacyRecognizer")
|
214 |
|
215 |
-
return nlp_engine, registry
|
|
|
10 |
|
11 |
logger = logging.getLogger("presidio-streamlit")
|
12 |
|
13 |
+
CZECH_ENTITY_MAPPING = {
|
14 |
+
"PER": "OSOBA",
|
15 |
+
"LOC": "ADRESA",
|
16 |
+
"ORG": "ORGANIZACE",
|
17 |
+
"MISC": "RŮZNÉ",
|
18 |
+
}
|
19 |
|
20 |
def create_nlp_engine_with_spacy(
|
21 |
model_path: str,
|
|
|
26 |
"""
|
27 |
nlp_configuration = {
|
28 |
"nlp_engine_name": "spacy",
|
29 |
+
"models": [{"lang_code": "cs", "model_name": model_path}],
|
30 |
"ner_model_configuration": {
|
31 |
"model_to_presidio_entity_mapping": {
|
32 |
+
**CZECH_ENTITY_MAPPING,
|
33 |
+
"PERSON": "OSOBA",
|
34 |
+
"GPE": "ADRESA",
|
35 |
+
"LOCATION": "ADRESA",
|
36 |
+
"ORGANIZATION": "ORGANIZACE",
|
37 |
+
"DATE": "DATUM_NAROZENÍ",
|
38 |
+
"CARDINAL": "RODNÉ_ČÍSLO",
|
39 |
+
"ORG": "IČO",
|
|
|
|
|
|
|
40 |
},
|
41 |
"low_confidence_score_multiplier": 0.4,
|
42 |
+
"low_score_entity_names": ["ORG", "ORGANIZACE"],
|
43 |
},
|
44 |
}
|
45 |
|
|
|
50 |
|
51 |
return nlp_engine, registry
|
52 |
|
|
|
53 |
def create_nlp_engine_with_stanza(
|
54 |
model_path: str,
|
55 |
) -> Tuple[NlpEngine, RecognizerRegistry]:
|
|
|
59 |
"""
|
60 |
nlp_configuration = {
|
61 |
"nlp_engine_name": "stanza",
|
62 |
+
"models": [{"lang_code": "cs", "model_name": model_path}],
|
63 |
"ner_model_configuration": {
|
64 |
"model_to_presidio_entity_mapping": {
|
65 |
+
**CZECH_ENTITY_MAPPING,
|
66 |
+
"PERSON": "OSOBA",
|
67 |
+
"GPE": "ADRESA",
|
68 |
+
"LOCATION": "ADRESA",
|
69 |
+
"ORGANIZATION": "ORGANIZACE",
|
70 |
+
"DATE": "DATUM_NAROZENÍ",
|
|
|
|
|
|
|
|
|
|
|
71 |
}
|
72 |
},
|
73 |
}
|
|
|
79 |
|
80 |
return nlp_engine, registry
|
81 |
|
|
|
82 |
def create_nlp_engine_with_transformers(
|
83 |
model_path: str,
|
84 |
) -> Tuple[NlpEngine, RecognizerRegistry]:
|
|
|
88 |
would return NlpArtifacts such as POS and lemmas.
|
89 |
:param model_path: HuggingFace model path.
|
90 |
"""
|
91 |
+
logger.info(f"Loading Transformers model: {model_path}")
|
92 |
|
93 |
nlp_configuration = {
|
94 |
"nlp_engine_name": "transformers",
|
95 |
"models": [
|
96 |
{
|
97 |
+
"lang_code": "cs",
|
98 |
+
"model_name": {"spacy": "cs_core_news_sm", "transformers": model_path},
|
99 |
}
|
100 |
],
|
101 |
"ner_model_configuration": {
|
102 |
"model_to_presidio_entity_mapping": {
|
103 |
+
**CZECH_ENTITY_MAPPING,
|
104 |
+
"PERSON": "OSOBA",
|
105 |
+
"LOC": "ADRESA",
|
106 |
+
"GPE": "ADRESA",
|
107 |
+
"ORG": "ORGANIZACE",
|
108 |
+
"DATE": "DATUM_NAROZENÍ",
|
109 |
+
"CARDINAL": "RODNÉ_ČÍSLO",
|
110 |
+
"ID": "IČO",
|
|
|
|
|
111 |
"EMAIL": "EMAIL",
|
112 |
+
"PHONE": "TELEFON",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
},
|
114 |
"low_confidence_score_multiplier": 0.4,
|
115 |
"low_score_entity_names": ["ID"],
|
116 |
"labels_to_ignore": [
|
|
|
117 |
"EVENT",
|
118 |
"LANGUAGE",
|
119 |
"LAW",
|
|
|
134 |
|
135 |
return nlp_engine, registry
|
136 |
|
|
|
137 |
def create_nlp_engine_with_flair(
|
138 |
model_path: str,
|
139 |
) -> Tuple[NlpEngine, RecognizerRegistry]:
|
|
|
148 |
registry = RecognizerRegistry()
|
149 |
registry.load_predefined_recognizers()
|
150 |
|
151 |
+
if not spacy.util.is_package("cs_core_news_sm"):
|
152 |
+
spacy.cli.download("cs_core_news_sm")
|
153 |
+
|
|
|
|
|
154 |
flair_recognizer = FlairRecognizer(model_path=model_path)
|
155 |
nlp_configuration = {
|
156 |
"nlp_engine_name": "spacy",
|
157 |
+
"models": [{"lang_code": "cs", "model_name": "cs_core_news_sm"}],
|
158 |
}
|
159 |
registry.add_recognizer(flair_recognizer)
|
160 |
registry.remove_recognizer("SpacyRecognizer")
|
|
|
163 |
|
164 |
return nlp_engine, registry
|
165 |
|
|
|
166 |
def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
|
167 |
"""
|
168 |
Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
|
|
|
184 |
)
|
185 |
nlp_configuration = {
|
186 |
"nlp_engine_name": "spacy",
|
187 |
+
"models": [{"lang_code": "cs", "model_name": "cs_core_news_sm"}],
|
188 |
}
|
189 |
|
190 |
nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
|
|
|
192 |
registry.add_recognizer(azure_ai_language_recognizer)
|
193 |
registry.remove_recognizer("SpacyRecognizer")
|
194 |
|
195 |
+
return nlp_engine, registry
|