File size: 6,702 Bytes
1e7dab8
 
 
 
 
 
 
 
 
 
 
46dbc0f
1e7dab8
 
 
 
46dbc0f
1e7dab8
 
 
 
 
 
 
 
 
46dbc0f
 
 
 
 
 
1e7dab8
 
 
 
 
 
 
 
 
 
 
 
46dbc0f
1e7dab8
 
 
 
 
 
 
 
 
46dbc0f
1e7dab8
 
 
 
 
 
 
 
 
 
 
 
46dbc0f
1e7dab8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

from typing import List, Dict, Optional, Tuple, Type
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_anonymizer.entities import (
    OperatorConfig,
)
from presidio_analyzer.nlp_engine import (
    NlpEngine,
    NlpEngineProvider,
)
from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration


class PiiMaskingService():

    def analyze(self, text: str, model: str):
        """
        Detect PII entities in ``text`` using the NER backend selected by ``model``.

        A new NLP engine, recognizer registry and ``AnalyzerEngine`` are built on
        every call; nothing is cached between calls.

        :param text: Text to scan for PII.
        :param model: Backend selector. Supported values are
            ``"HuggingFace/obi/deid_roberta_i2b2"`` and ``"flair/ner-english-large"``.
        :return: The value returned by ``AnalyzerEngine.analyze`` for ``text``
            (presumably a list of ``RecognizerResult`` -- confirm against Presidio).
        :raises ValueError: If ``model`` is not one of the supported selectors.
        """
        entities_to_recognize = [
            'UK_NHS', 'EMAIL', 'AU_ABN', 'CRYPTO', 'ID', 'URL',
            'AU_MEDICARE', 'IN_PAN', 'ORGANIZATION', 'IN_AADHAAR',
            'SG_NRIC_FIN', 'EMAIL_ADDRESS', 'AU_ACN', 'US_DRIVER_LICENSE',
            'IP_ADDRESS', 'DATE_TIME', 'LOCATION', 'PERSON', 'CREDIT_CARD',
            'IBAN_CODE', 'US_BANK_NUMBER', 'PHONE_NUMBER', 'MEDICAL_LICENSE',
            'US_SSN', 'AU_TFN', 'US_PASSPORT', 'US_ITIN', 'NRP', 'AGE', 'GENERIC_PII',
        ]

        if model == "HuggingFace/obi/deid_roberta_i2b2":
            # The "HuggingFace/" prefix is only a selector; the hub id is passed without it.
            nlp_engine, registry = self.create_nlp_engine_with_transformers("obi/deid_roberta_i2b2")
        elif model == "flair/ner-english-large":
            nlp_engine, registry = self.create_nlp_engine_with_flair("flair/ner-english-large")
        else:
            # Fail fast with a clear message: without this branch nlp_engine and
            # registry stay unbound and the AnalyzerEngine call below crashes.
            raise ValueError(
                f"Unsupported model {model!r}; expected "
                "'HuggingFace/obi/deid_roberta_i2b2' or 'flair/ner-english-large'"
            )

        analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)

        results = analyzer.analyze(text=text, entities=entities_to_recognize, language='en')
        print("analyzer results:")
        print(results)

        return results
    
    
    def anonymize(
            self,
            text: str,
            operator: str,
            model: str,
            encrypt_key: Optional[str] = None,
    ):
        """
        Detect PII in ``text`` and return the text with every finding anonymized.

        :param text: Text to anonymize.
        :param operator: Anonymization operator name. ``"mask"``, ``"encrypt"`` and
            ``"highlight"`` receive the parameters configured below; any other name
            is handed to ``OperatorConfig`` unchanged, with no parameters.
        :param model: Backend selector forwarded to ``self.analyze``.
        :param encrypt_key: Key for the ``"encrypt"`` operator. When omitted, the
            legacy built-in key is used so existing callers keep working.
        :return: ``result.text`` of the ``AnonymizerEngine.anonymize`` result.
        """
        if encrypt_key is None:
            # NOTE(review): this key is committed to source control and should not
            # protect real data -- pass ``encrypt_key`` from a secret store instead.
            encrypt_key = "WmZq4t7w!z%C&F)J"

        operator_config = None
        if operator == 'mask':
            operator_config = {
                "type": "mask",
                "masking_char": "*",
                "chars_to_mask": 15,
                "from_end": False,
            }
        elif operator == "encrypt":
            operator_config = {"key": encrypt_key}
        elif operator == "highlight":
            # "highlight" runs as a "custom" operator with an identity lambda,
            # so the value handed to the lambda comes back unchanged.
            operator_config = {"lambda": lambda x: x}
            operator = "custom"

        analyzer_result = self.analyze(text, model)

        engine = AnonymizerEngine()

        # The single "DEFAULT" entry applies the chosen operator to every entity type.
        result = engine.anonymize(
            text=text,
            operators={"DEFAULT": OperatorConfig(operator, operator_config)},
            analyzer_results=analyzer_result
        )
        print("res:")
        print(result)
        print(result.text)
        print(type(result.text))

        return result.text
    
    
    def create_nlp_engine_with_flair(
            self,
            model_path: str,
    ) -> Tuple[NlpEngine, RecognizerRegistry]:
        """
        Build a spaCy-backed NlpEngine plus a registry that relies on Flair for NER.

        The registry starts from the predefined recognizers, gains a
        ``FlairRecognizer`` built from ``model_path`` and drops ``"SpacyRecognizer"``.
        The engine is created from a spaCy ``en_core_web_sm`` configuration; this
        method does not download that model.
        :param model_path: Flair model path.
        :return: Tuple of ``(nlp_engine, registry)``.
        """
        from flair_recognizer import FlairRecognizer

        recognizers = RecognizerRegistry()
        recognizers.load_predefined_recognizers()

        # Flair is plugged in as an extra recognizer rather than as an NlpEngine,
        # and takes the place of the registry's "SpacyRecognizer".
        flair_ner = FlairRecognizer(model_path=model_path)
        recognizers.add_recognizer(flair_ner)
        recognizers.remove_recognizer("SpacyRecognizer")

        # Small spaCy model used for the engine itself.
        spacy_models = [{"lang_code": "en", "model_name": "en_core_web_sm"}]
        spacy_config = {
            "nlp_engine_name": "spacy",
            "models": spacy_models,
        }
        provider = NlpEngineProvider(nlp_configuration=spacy_config)
        engine = provider.create_engine()

        return engine, recognizers
    

    def create_nlp_engine_with_transformers(
            self,
            model_path: str,
    ) -> Tuple[NlpEngine, RecognizerRegistry]:
        """
        Build a "transformers" NlpEngine for ``model_path`` and a matching registry.

        The engine configuration pairs the spaCy model ``en_core_web_sm`` with the
        given transformers model, maps the model's labels onto Presidio entity
        names, and lists labels to ignore. The registry is filled with the
        predefined recognizers, loaded against the engine that was just created.
        :param model_path: HuggingFace model path.
        :return: Tuple of ``(nlp_engine, registry)``.
        """
        print(f"Loading Transformers model: {model_path} of type {type(model_path)}")

        # Model label -> Presidio entity name.
        label_to_entity = {
            "PER": "PERSON",
            "PERSON": "PERSON",
            "LOC": "LOCATION",
            "LOCATION": "LOCATION",
            "GPE": "LOCATION",
            "ORG": "ORGANIZATION",
            "ORGANIZATION": "ORGANIZATION",
            "NORP": "NRP",
            "AGE": "AGE",
            "ID": "ID",
            "EMAIL": "EMAIL",
            "PATIENT": "PERSON",
            "STAFF": "PERSON",
            "HOSP": "ORGANIZATION",
            "PATORG": "ORGANIZATION",
            "DATE": "DATE_TIME",
            "TIME": "DATE_TIME",
            "PHONE": "PHONE_NUMBER",
            "HCW": "PERSON",
            "HOSPITAL": "ORGANIZATION",
            "FACILITY": "LOCATION",
        }

        # Labels listed under "labels_to_ignore" in the NER configuration.
        ignored_labels = [
            "CARDINAL",
            "EVENT",
            "LANGUAGE",
            "LAW",
            "MONEY",
            "ORDINAL",
            "PERCENT",
            "PRODUCT",
            "QUANTITY",
            "WORK_OF_ART",
        ]

        ner_settings = {
            "model_to_presidio_entity_mapping": label_to_entity,
            "low_confidence_score_multiplier": 0.4,
            "low_score_entity_names": ["ID"],
            "labels_to_ignore": ignored_labels,
        }

        english_model = {
            "lang_code": "en",
            "model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
        }

        engine_config = {
            "nlp_engine_name": "transformers",
            "models": [english_model],
            "ner_model_configuration": ner_settings,
        }

        provider = NlpEngineProvider(nlp_configuration=engine_config)
        engine = provider.create_engine()

        recognizers = RecognizerRegistry()
        recognizers.load_predefined_recognizers(nlp_engine=engine)

        return engine, recognizers