petrsovadina committed on
Commit
0c295e3
1 Parent(s): d74013f

Update presidio_helpers.py

Browse files
Files changed (1) hide show
  1. presidio_helpers.py +11 -6
presidio_helpers.py CHANGED
@@ -1,21 +1,25 @@
1
  import spacy
2
  from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer
3
  from presidio_anonymizer import AnonymizerEngine
4
- from presidio_anonymizer.entities import AnonymizerConfig
5
 
6
  # Vytvoření českých rozpoznávačů
7
  def create_czech_recognizers():
8
  recognizers = []
9
 
 
10
  rodne_cislo_recognizer = PatternRecognizer(supported_entity="RODNÉ_ČÍSLO", patterns=[r'\b\d{6}/\d{4}\b'])
11
  recognizers.append(rodne_cislo_recognizer)
12
 
 
13
  telefon_recognizer = PatternRecognizer(supported_entity="TELEFON", patterns=[r'\b(?:\+420)?\s?\d{3}\s?\d{3}\s?\d{3}\b'])
14
  recognizers.append(telefon_recognizer)
15
 
 
16
  email_recognizer = PatternRecognizer(supported_entity="EMAIL", patterns=[r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'])
17
  recognizers.append(email_recognizer)
18
 
 
19
  ico_recognizer = PatternRecognizer(supported_entity="IČO", patterns=[r'\b\d{8}\b'])
20
  recognizers.append(ico_recognizer)
21
 
@@ -23,11 +27,12 @@ def create_czech_recognizers():
23
 
24
  # Funkce pro vytvoření analyzer enginu
25
  def analyzer_engine():
26
- nlp = spacy.load("en_core_web_sm")
27
 
28
  registry = RecognizerRegistry()
29
  registry.load_predefined_recognizers(nlp_engine=nlp)
30
 
 
31
  czech_recognizers = create_czech_recognizers()
32
  for recognizer in czech_recognizers:
33
  registry.add_recognizer(recognizer)
@@ -45,15 +50,15 @@ def anonymize(text, anonymization_method="replace"):
45
 
46
  # Zvolení metody anonymizace
47
  if anonymization_method == "replace":
48
- anonymizer_config = {"DEFAULT": AnonymizerConfig("replace", {"new_value": "<ANONYMIZED>"})}
49
  elif anonymization_method == "mask":
50
- anonymizer_config = {"DEFAULT": AnonymizerConfig("mask", {"masking_char": "*", "chars_to_mask": 4, "from_end": True})}
51
  elif anonymization_method == "hash":
52
- anonymizer_config = {"DEFAULT": AnonymizerConfig("hash")}
53
  else:
54
  raise ValueError("Neznámá metoda anonymizace")
55
 
56
  # Anonymizuj rozpoznané údaje
57
- anonymized_text = anonymizer.anonymize(text=text, analyzer_results=analyzer_results, anonymizers_config=anonymizer_config)
58
 
59
  return anonymized_text
 
1
  import spacy
2
  from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer
3
  from presidio_anonymizer import AnonymizerEngine
4
+ from presidio_anonymizer.entities import OperatorConfig
5
 
6
  # Vytvoření českých rozpoznávačů
7
  def create_czech_recognizers():
8
  recognizers = []
9
 
10
+ # Rodné číslo
11
  rodne_cislo_recognizer = PatternRecognizer(supported_entity="RODNÉ_ČÍSLO", patterns=[r'\b\d{6}/\d{4}\b'])
12
  recognizers.append(rodne_cislo_recognizer)
13
 
14
+ # Telefonní číslo
15
  telefon_recognizer = PatternRecognizer(supported_entity="TELEFON", patterns=[r'\b(?:\+420)?\s?\d{3}\s?\d{3}\s?\d{3}\b'])
16
  recognizers.append(telefon_recognizer)
17
 
18
+ # Email
19
  email_recognizer = PatternRecognizer(supported_entity="EMAIL", patterns=[r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'])
20
  recognizers.append(email_recognizer)
21
 
22
+ # IČO
23
  ico_recognizer = PatternRecognizer(supported_entity="IČO", patterns=[r'\b\d{8}\b'])
24
  recognizers.append(ico_recognizer)
25
 
 
27
 
28
  # Funkce pro vytvoření analyzer enginu
29
  def analyzer_engine():
30
+ nlp = spacy.load("cs_core_news_sm") # Změna na český model
31
 
32
  registry = RecognizerRegistry()
33
  registry.load_predefined_recognizers(nlp_engine=nlp)
34
 
35
+ # Přidání českých rozpoznávačů
36
  czech_recognizers = create_czech_recognizers()
37
  for recognizer in czech_recognizers:
38
  registry.add_recognizer(recognizer)
 
50
 
51
  # Zvolení metody anonymizace
52
  if anonymization_method == "replace":
53
+ operators = {"DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"})}
54
  elif anonymization_method == "mask":
55
+ operators = {"DEFAULT": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 4})}
56
  elif anonymization_method == "hash":
57
+ operators = {"DEFAULT": OperatorConfig("hash", {"hash_type": "sha256"})}
58
  else:
59
  raise ValueError("Neznámá metoda anonymizace")
60
 
61
  # Anonymizuj rozpoznané údaje
62
+ anonymized_text = anonymizer.anonymize(text=text, analyzer_results=analyzer_results, operators=operators)
63
 
64
  return anonymized_text