Spaces:
Running
Running
petrsovadina
committed on
Commit
•
2c568ba
1
Parent(s):
0c295e3
Update presidio_helpers.py
Browse files- presidio_helpers.py +39 -1
presidio_helpers.py
CHANGED
@@ -2,6 +2,7 @@ import spacy
|
|
2 |
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer
|
3 |
from presidio_anonymizer import AnonymizerEngine
|
4 |
from presidio_anonymizer.entities import OperatorConfig
|
|
|
5 |
|
6 |
# Vytvoření českých rozpoznávačů
|
7 |
def create_czech_recognizers():
|
@@ -61,4 +62,41 @@ def anonymize(text, anonymization_method="replace"):
|
|
61 |
# Anonymizuj rozpoznané údaje
|
62 |
anonymized_text = anonymizer.anonymize(text=text, analyzer_results=analyzer_results, operators=operators)
|
63 |
|
64 |
-
return anonymized_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer
|
3 |
from presidio_anonymizer import AnonymizerEngine
|
4 |
from presidio_anonymizer.entities import OperatorConfig
|
5 |
+
from annotated_text import annotated_text
|
6 |
|
7 |
# Vytvoření českých rozpoznávačů
|
8 |
def create_czech_recognizers():
|
|
|
62 |
# Anonymizuj rozpoznané údaje
|
63 |
anonymized_text = anonymizer.anonymize(text=text, analyzer_results=analyzer_results, operators=operators)
|
64 |
|
65 |
+
return anonymized_text
|
66 |
+
|
67 |
+
# Funkce pro anotaci (zvýraznění) textu s rozpoznanými entitami
|
68 |
+
def annotate(text, analyzer_results):
    """Split *text* into plain and highlighted segments.

    The returned list alternates between plain ``str`` chunks and
    ``(entity_text, entity_type, "#faa")`` tuples, which is the argument
    shape the ``annotated_text`` call in this file is fed with.

    Args:
        text: The analysed input string.
        analyzer_results: Iterable of objects exposing ``start``, ``end`` and
            ``entity_type`` attributes. May be unsorted and may contain
            overlapping spans; ``None`` is treated as "no results".

    Returns:
        A list of segments covering ``text`` from left to right, with every
        character emitted at most once.
    """
    annotated_tokens = []
    last_idx = 0

    # Results are not guaranteed to arrive in document order, so order them
    # by start offset; for equal starts the longest span wins.
    ordered = sorted(analyzer_results or (), key=lambda r: (r.start, -r.end))

    for result in ordered:
        start, end = result.start, result.end

        # Skip zero-length spans and spans overlapping an entity that was
        # already emitted; otherwise text would be duplicated in the output.
        if start >= end or start < last_idx:
            continue

        # Plain text between the previous entity and this one, if any.
        if last_idx < start:
            annotated_tokens.append(text[last_idx:start])

        # The highlighted entity itself.
        annotated_tokens.append((text[start:end], result.entity_type, "#faa"))

        last_idx = end

    # Trailing plain text after the last entity.
    if last_idx < len(text):
        annotated_tokens.append(text[last_idx:])

    return annotated_tokens
|
92 |
+
|
93 |
+
# Testovací aplikace pro anotaci
|
94 |
+
if __name__ == "__main__":
    # Demo entry point: analyse a Czech sample sentence and render it with
    # the detected entities highlighted.
    sample = "Jan Novák, narozený 15.3.1980, bydlí na adrese Hlavní 123, Praha 1. Jeho telefonní číslo je +420 123 456 789 a e-mail jan.novak@email.cz. Rodné číslo: 800315/1234, IČO: 12345678."

    engine = analyzer_engine()
    findings = engine.analyze(text=sample, entities=None, language="cs")

    # Display the annotated text.
    annotated_text(*annotate(sample, findings))
|