Spaces:
Running
Running
petrsovadina
committed on
Commit
•
0c295e3
1
Parent(s):
d74013f
Update presidio_helpers.py
Browse files - presidio_helpers.py +11 -6
presidio_helpers.py
CHANGED
@@ -1,21 +1,25 @@
|
|
1 |
import spacy
|
2 |
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer
|
3 |
from presidio_anonymizer import AnonymizerEngine
|
4 |
-
from presidio_anonymizer.entities import
|
5 |
|
6 |
# Vytvoření českých rozpoznávačů
|
7 |
def create_czech_recognizers():
|
8 |
recognizers = []
|
9 |
|
|
|
10 |
rodne_cislo_recognizer = PatternRecognizer(supported_entity="RODNÉ_ČÍSLO", patterns=[r'\b\d{6}/\d{4}\b'])
|
11 |
recognizers.append(rodne_cislo_recognizer)
|
12 |
|
|
|
13 |
telefon_recognizer = PatternRecognizer(supported_entity="TELEFON", patterns=[r'\b(?:\+420)?\s?\d{3}\s?\d{3}\s?\d{3}\b'])
|
14 |
recognizers.append(telefon_recognizer)
|
15 |
|
|
|
16 |
email_recognizer = PatternRecognizer(supported_entity="EMAIL", patterns=[r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'])
|
17 |
recognizers.append(email_recognizer)
|
18 |
|
|
|
19 |
ico_recognizer = PatternRecognizer(supported_entity="IČO", patterns=[r'\b\d{8}\b'])
|
20 |
recognizers.append(ico_recognizer)
|
21 |
|
@@ -23,11 +27,12 @@ def create_czech_recognizers():
|
|
23 |
|
24 |
# Funkce pro vytvoření analyzer enginu
|
25 |
def analyzer_engine():
|
26 |
-
nlp = spacy.load("
|
27 |
|
28 |
registry = RecognizerRegistry()
|
29 |
registry.load_predefined_recognizers(nlp_engine=nlp)
|
30 |
|
|
|
31 |
czech_recognizers = create_czech_recognizers()
|
32 |
for recognizer in czech_recognizers:
|
33 |
registry.add_recognizer(recognizer)
|
@@ -45,15 +50,15 @@ def anonymize(text, anonymization_method="replace"):
|
|
45 |
|
46 |
# Zvolení metody anonymizace
|
47 |
if anonymization_method == "replace":
|
48 |
-
|
49 |
elif anonymization_method == "mask":
|
50 |
-
|
51 |
elif anonymization_method == "hash":
|
52 |
-
|
53 |
else:
|
54 |
raise ValueError("Neznámá metoda anonymizace")
|
55 |
|
56 |
# Anonymizuj rozpoznané údaje
|
57 |
-
anonymized_text = anonymizer.anonymize(text=text, analyzer_results=analyzer_results,
|
58 |
|
59 |
return anonymized_text
|
|
|
1 |
import spacy
|
2 |
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer
|
3 |
from presidio_anonymizer import AnonymizerEngine
|
4 |
+
from presidio_anonymizer.entities import OperatorConfig
|
5 |
|
6 |
# Vytvoření českých rozpoznávačů
|
7 |
def create_czech_recognizers():
|
8 |
recognizers = []
|
9 |
|
10 |
+
# Rodné číslo
|
11 |
rodne_cislo_recognizer = PatternRecognizer(supported_entity="RODNÉ_ČÍSLO", patterns=[r'\b\d{6}/\d{4}\b'])
|
12 |
recognizers.append(rodne_cislo_recognizer)
|
13 |
|
14 |
+
# Telefonní číslo
|
15 |
telefon_recognizer = PatternRecognizer(supported_entity="TELEFON", patterns=[r'\b(?:\+420)?\s?\d{3}\s?\d{3}\s?\d{3}\b'])
|
16 |
recognizers.append(telefon_recognizer)
|
17 |
|
18 |
+
# Email
|
19 |
email_recognizer = PatternRecognizer(supported_entity="EMAIL", patterns=[r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'])
|
20 |
recognizers.append(email_recognizer)
|
21 |
|
22 |
+
# IČO
|
23 |
ico_recognizer = PatternRecognizer(supported_entity="IČO", patterns=[r'\b\d{8}\b'])
|
24 |
recognizers.append(ico_recognizer)
|
25 |
|
|
|
27 |
|
28 |
# Funkce pro vytvoření analyzer enginu
|
29 |
def analyzer_engine():
|
30 |
+
nlp = spacy.load("cs_core_news_sm") # Změna na český model
|
31 |
|
32 |
registry = RecognizerRegistry()
|
33 |
registry.load_predefined_recognizers(nlp_engine=nlp)
|
34 |
|
35 |
+
# Přidání českých rozpoznávačů
|
36 |
czech_recognizers = create_czech_recognizers()
|
37 |
for recognizer in czech_recognizers:
|
38 |
registry.add_recognizer(recognizer)
|
|
|
50 |
|
51 |
# Zvolení metody anonymizace
|
52 |
if anonymization_method == "replace":
|
53 |
+
operators = {"DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"})}
|
54 |
elif anonymization_method == "mask":
|
55 |
+
operators = {"DEFAULT": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 4})}
|
56 |
elif anonymization_method == "hash":
|
57 |
+
operators = {"DEFAULT": OperatorConfig("hash", {"hash_type": "sha256"})}
|
58 |
else:
|
59 |
raise ValueError("Neznámá metoda anonymizace")
|
60 |
|
61 |
# Anonymizuj rozpoznané údaje
|
62 |
+
anonymized_text = anonymizer.anonymize(text=text, analyzer_results=analyzer_results, operators=operators)
|
63 |
|
64 |
return anonymized_text
|