KarishmaShirsath committed on
Commit
1e7dab8
1 Parent(s): 495f267

Upload 4 files

Browse files

added more de-identification methods

Files changed (4) hide show
  1. Final_file.py +17 -32
  2. PiiMaskingService.py +183 -0
  3. app.py +30 -13
  4. flair_recognizer.py +186 -0
Final_file.py CHANGED
@@ -733,54 +733,39 @@ class FlairRecognizer2():
733
  text: str,
734
  operator: str,
735
  # analyze_results: List[RecognizerResult],
736
- mask_char: Optional[str] = None,
737
- number_of_chars: Optional[str] = None,
738
- encrypt_key: Optional[str] = None,
739
  ):
740
  """Anonymize identified input using Presidio Anonymizer.
741
  :param text: Full text
742
  :param operator: Operator name
743
- :param mask_char: Mask char (for mask operator)
744
- :param number_of_chars: Number of characters to mask (for mask operator)
745
- :param encrypt_key: Encryption key (for encrypt operator)
746
  :param analyze_results: list of results from presidio analyzer engine
747
  """
748
 
749
- if operator == "mask":
 
 
 
 
 
 
 
 
 
 
 
750
  operator_config = {
751
  "type": "mask",
752
- "masking_char": mask_char,
753
- "chars_to_mask": number_of_chars,
754
  "from_end": False,
755
  }
756
-
757
- # Define operator config
758
  elif operator == "encrypt":
759
  operator_config = {"key": encrypt_key}
760
  elif operator == "highlight":
761
  operator_config = {"lambda": lambda x: x}
762
- else:
763
- operator_config = None
764
 
765
- # Change operator if needed as intermediate step
766
  if operator == "highlight":
767
  operator = "custom"
768
- elif operator == "synthesize":
769
- operator = "replace"
770
- else:
771
- operator = operator
772
-
773
- # res = AnonymizerEngine().anonymize(
774
- # text,
775
- # analyze_results,
776
- # operators={"DEFAULT": OperatorConfig("redact", operator_config)},
777
- # )
778
-
779
-
780
-
781
- entitiesToRecognize=['PHONE_NUMBER', 'PERSON', 'ID', 'LOCATION', 'EMAIL', 'URL', 'CREDIT_CARD', 'AGE', 'DATE_TIME', 'CRYPTO'
782
- 'IP_ADDRESS', 'US_PASSPORT', 'US_BANK_NUMBER'
783
- ]
784
 
785
  analyzer = AnalyzerEngine()
786
 
@@ -794,8 +779,8 @@ class FlairRecognizer2():
794
  # Operators to define the anonymization type.
795
  result = engine.anonymize(
796
  text=text,
797
- analyzer_results=results,
798
- operators={"DEFAULT": OperatorConfig(operator, {"new_value": "BIP"})}
799
  )
800
  print("res:")
801
  print(result)
 
733
  text: str,
734
  operator: str,
735
  # analyze_results: List[RecognizerResult],
 
 
 
736
  ):
737
  """Anonymize identified input using Presidio Anonymizer.
738
  :param text: Full text
739
  :param operator: Operator name
 
 
 
740
  :param analyze_results: list of results from presidio analyzer engine
741
  """
742
 
743
+ entitiesToRecognize=['UK_NHS','EMAIL','AU_ABN','CRYPTO','ID','URL',
744
+ 'AU_MEDICARE','IN_PAN','ORGANIZATION','IN_AADHAAR',
745
+ 'SG_NRIC_FIN','EMAIL_ADDRESS','AU_ACN','US_DRIVER_LICENSE',
746
+ 'IP_ADDRESS','DATE_TIME','LOCATION','PERSON','CREDIT_CARD',
747
+ 'IBAN_CODE','US_BANK_NUMBER','PHONE_NUMBER','MEDICAL_LICENSE',
748
+ 'US_SSN','AU_TFN','US_PASSPORT','US_ITIN','NRP','AGE','GENERIC_PII'
749
+ ]
750
+
751
+ operator_config = None
752
+ encrypt_key = "WmZq4t7w!z%C&F)J"
753
+
754
+ if operator == 'mask':
755
  operator_config = {
756
  "type": "mask",
757
+ "masking_char": "*",
758
+ "chars_to_mask": 10,
759
  "from_end": False,
760
  }
 
 
761
  elif operator == "encrypt":
762
  operator_config = {"key": encrypt_key}
763
  elif operator == "highlight":
764
  operator_config = {"lambda": lambda x: x}
 
 
765
 
766
+
767
  if operator == "highlight":
768
  operator = "custom"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
769
 
770
  analyzer = AnalyzerEngine()
771
 
 
779
  # Operators to define the anonymization type.
780
  result = engine.anonymize(
781
  text=text,
782
+ operators={"DEFAULT": OperatorConfig(operator, operator_config)},
783
+ analyzer_results=results
784
  )
785
  print("res:")
786
  print(result)
PiiMaskingService.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import List, Dict, Optional, Tuple, Type
3
+ from presidio_anonymizer import AnonymizerEngine
4
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
5
+ from presidio_anonymizer.entities import (
6
+ OperatorConfig,
7
+ )
8
+ from presidio_analyzer.nlp_engine import (
9
+ NlpEngine,
10
+ NlpEngineProvider,
11
+ )
12
+
13
+
14
class PiiMaskingService():
    """Detect and de-identify PII in free text using Microsoft Presidio.

    The analyzer side is backed by a Flair NER model (see
    ``create_nlp_engine_with_flair``) or, alternatively, a HuggingFace
    transformers model (``create_nlp_engine_with_transformers``).  The
    anonymizer side applies a Presidio operator such as ``"redact"``,
    ``"replace"``, ``"mask"``, ``"hash"``, ``"encrypt"`` or ``"highlight"``.
    """

    # Entity types requested from the analyzer engine.  Kept as a class
    # constant so analyze() and future callers share one definition.
    ENTITIES_TO_RECOGNIZE = [
        'UK_NHS', 'EMAIL', 'AU_ABN', 'CRYPTO', 'ID', 'URL',
        'AU_MEDICARE', 'IN_PAN', 'ORGANIZATION', 'IN_AADHAAR',
        'SG_NRIC_FIN', 'EMAIL_ADDRESS', 'AU_ACN', 'US_DRIVER_LICENSE',
        'IP_ADDRESS', 'DATE_TIME', 'LOCATION', 'PERSON', 'CREDIT_CARD',
        'IBAN_CODE', 'US_BANK_NUMBER', 'PHONE_NUMBER', 'MEDICAL_LICENSE',
        'US_SSN', 'AU_TFN', 'US_PASSPORT', 'US_ITIN', 'NRP', 'AGE',
        'GENERIC_PII',
    ]

    def analyze(self, text: str):
        """Run the Presidio analyzer over *text* and return its findings.

        :param text: Full text to scan for PII.
        :return: List of ``presidio_analyzer.RecognizerResult``.
        """
        # Build the Flair-backed NLP engine and recognizer registry and
        # actually hand them to the AnalyzerEngine.  (Previously the pair
        # was created, printed, and then discarded, so the analyzer
        # silently fell back to the default spaCy-only pipeline while
        # still paying the cost of loading the Flair model.)
        nlp_engine, registry = self.create_nlp_engine_with_flair(
            "flair/ner-english-large"
        )
        analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)

        results = analyzer.analyze(
            text=text,
            entities=self.ENTITIES_TO_RECOGNIZE,
            language='en',
        )
        print("analyzer results:")
        print(results)

        return results

    def anonymize(
        self,
        text: str,
        operator: str,
    ):
        """Anonymize PII found in *text* with the given Presidio operator.

        :param text: Full text.
        :param operator: Presidio operator name ("redact", "replace",
            "mask", "hash", "encrypt" or "highlight").
        :return: The anonymized text (``str``).
        """
        # SECURITY(review): hard-coded AES key; in production this must
        # come from configuration or a secret store, not source code.
        encrypt_key = "WmZq4t7w!z%C&F)J"

        # Operator-specific configuration; None lets Presidio use the
        # operator's defaults (redact / replace / hash take no params).
        if operator == 'mask':
            operator_config = {
                "type": "mask",
                "masking_char": "*",
                "chars_to_mask": 10,
                "from_end": False,
            }
        elif operator == "encrypt":
            operator_config = {"key": encrypt_key}
        elif operator == "highlight":
            operator_config = {"lambda": lambda x: x}
        else:
            operator_config = None

        # "highlight" is implemented as a Presidio custom operator whose
        # lambda returns the original text unchanged.
        if operator == "highlight":
            operator = "custom"

        analyzer_result = self.analyze(text)

        engine = AnonymizerEngine()

        # Invoke the anonymizer with the text, analyzer results and the
        # operator configuration defining the anonymization type.
        result = engine.anonymize(
            text=text,
            operators={"DEFAULT": OperatorConfig(operator, operator_config)},
            analyzer_results=analyzer_result,
        )
        print("res:")
        print(result)

        return result.text

    def create_nlp_engine_with_flair(
        self,
        model_path: str,
    ) -> Tuple[NlpEngine, RecognizerRegistry]:
        """
        Instantiate an NlpEngine with a FlairRecognizer and a small spaCy model.

        The FlairRecognizer returns results from Flair models; the spaCy
        model supplies NlpArtifacts such as POS tags and lemmas.

        :param model_path: Flair model path.
        :return: (nlp_engine, registry) ready to pass to AnalyzerEngine.
        """
        # Imported lazily: flair is a heavy dependency and this module
        # should stay importable without it.
        from flair_recognizer import FlairRecognizer

        registry = RecognizerRegistry()
        registry.load_predefined_recognizers()

        # There is no official Flair NlpEngine, hence we load it as an
        # additional recognizer on top of a small spaCy pipeline.
        flair_recognizer = FlairRecognizer(model_path=model_path)
        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
        }
        registry.add_recognizer(flair_recognizer)
        # Flair replaces spaCy for NER, so drop the spaCy recognizer to
        # avoid duplicate, lower-quality hits.
        registry.remove_recognizer("SpacyRecognizer")

        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

        return nlp_engine, registry

    def create_nlp_engine_with_transformers(
        self,
        model_path: str,
    ) -> Tuple[NlpEngine, RecognizerRegistry]:
        """
        Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.

        The TransformersRecognizer returns results from Transformers
        models; the spaCy model supplies NlpArtifacts such as POS tags
        and lemmas.

        :param model_path: HuggingFace model path.
        :return: (nlp_engine, registry) ready to pass to AnalyzerEngine.
        """
        print(f"Loading Transformers model: {model_path} of type {type(model_path)}")

        nlp_configuration = {
            "nlp_engine_name": "transformers",
            "models": [
                {
                    "lang_code": "en",
                    "model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
                }
            ],
            "ner_model_configuration": {
                # Map model-specific NER labels onto Presidio entities.
                "model_to_presidio_entity_mapping": {
                    "PER": "PERSON",
                    "PERSON": "PERSON",
                    "LOC": "LOCATION",
                    "LOCATION": "LOCATION",
                    "GPE": "LOCATION",
                    "ORG": "ORGANIZATION",
                    "ORGANIZATION": "ORGANIZATION",
                    "NORP": "NRP",
                    "AGE": "AGE",
                    "ID": "ID",
                    "EMAIL": "EMAIL",
                    "PATIENT": "PERSON",
                    "STAFF": "PERSON",
                    "HOSP": "ORGANIZATION",
                    "PATORG": "ORGANIZATION",
                    "DATE": "DATE_TIME",
                    "TIME": "DATE_TIME",
                    "PHONE": "PHONE_NUMBER",
                    "HCW": "PERSON",
                    "HOSPITAL": "ORGANIZATION",
                    "FACILITY": "LOCATION",
                },
                # ID predictions are noisy for this model family; damp them.
                "low_confidence_score_multiplier": 0.4,
                "low_score_entity_names": ["ID"],
                # Labels that are not PII and should never surface.
                "labels_to_ignore": [
                    "CARDINAL",
                    "EVENT",
                    "LANGUAGE",
                    "LAW",
                    "MONEY",
                    "ORDINAL",
                    "PERCENT",
                    "PRODUCT",
                    "QUANTITY",
                    "WORK_OF_ART",
                ],
            },
        }

        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

        registry = RecognizerRegistry()
        registry.load_predefined_recognizers(nlp_engine=nlp_engine)

        return nlp_engine, registry
app.py CHANGED
@@ -8,6 +8,7 @@ import docx
8
  from fpdf import FPDF
9
  import io
10
  from docx import Document
 
11
 
12
  # Cache the model loading and prediction function
13
  @st.cache_resource
@@ -23,6 +24,10 @@ def cached_analyze_text(text, operator):
23
  def cached_anonimize_text(text, operator):
24
  return FlairRecognizer2.anonymize(text, operator)
25
 
 
 
 
 
26
  def download_masked_file(masked_text, file_extension):
27
 
28
  # Create a temporary file to store the masked text
@@ -73,29 +78,38 @@ def main():
73
 
74
  st_operator = st.sidebar.selectbox(
75
  "De-identification approach",
76
- ["redact", "replace", "hash"],
77
  index=1,
78
  help="""
79
  Select which manipulation to the text is requested after PII has been identified.\n
80
  - Redact: Completely remove the PII text\n
81
  - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
82
- - Synthesize: Replace with fake values (requires an OpenAI key)\n
83
  - Highlight: Shows the original text with PII highlighted in colors\n
84
  - Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
85
  - Hash: Replaces with the hash of the PII string\n
86
  - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
87
  """,
88
  )
89
- # # Dropdown menu with four choices
90
- # st.sidebar.header('Masking Options')
91
- # choice = st.sidebar.selectbox('Choose your masking option:', ['Option 1', 'Option 2', 'Option 3', 'Option 4'])
 
 
 
 
 
 
 
 
 
92
  masked_text_public = ''
93
  if upload_option == 'Text Input':
94
  input_text = st.text_area("Enter text here:")
95
  if st.button('Analyze'):
96
  with st.spinner('Wait for it... the model is loading'):
97
- cached_predict_ner_tags(input_text)
98
- masked_text = cached_anonimize_text(input_text, st_operator)
 
99
  st.text_area("Masked text:", value=masked_text, height=200)
100
  elif upload_option == 'File Upload':
101
  uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
@@ -106,8 +120,9 @@ def main():
106
  extracted_text = extract_text_from_pdf(uploaded_file)
107
  if st.button('Analyze'):
108
  with st.spinner('Wait for it... the model is loading'):
109
- cached_predict_ner_tags(extracted_text)
110
- masked_text = cached_analyze_text(extracted_text)
 
111
  st.text_area("Masked text:", value=masked_text, height=200) # Display the extracted text
112
  if extracted_text:
113
  pdf = create_pdf(masked_text)
@@ -128,8 +143,9 @@ def main():
128
  text += paragraph.text
129
  if st.button('Analyze'):
130
  with st.spinner('Wait for it... the model is loading'):
131
- cached_predict_ner_tags(text)
132
- masked_text = cached_analyze_text(text)
 
133
  st.text_area("Masked text:", value=masked_text, height=200)
134
  #create word file
135
  doc_io = create_word_file(masked_text)
@@ -138,8 +154,9 @@ def main():
138
  else:
139
  if st.button('Analyze'):
140
  with st.spinner('Wait for it... the model is loading'):
141
- cached_predict_ner_tags(file_contents.decode())
142
- masked_text = cached_analyze_text(file_contents.decode())
 
143
  st.text_area("Masked text:", value=masked_text, height=200)
144
  st.download_button(label="Download",data = masked_text,file_name="masked_text.txt")
145
 
 
8
  from fpdf import FPDF
9
  import io
10
  from docx import Document
11
+ from PiiMaskingService import PiiMaskingService
12
 
13
  # Cache the model loading and prediction function
14
  @st.cache_resource
 
24
  def cached_anonimize_text(text, operator):
25
  return FlairRecognizer2.anonymize(text, operator)
26
 
27
+ @st.cache_resource
28
+ def anonymize(text, operator):
29
+ return PiiMaskingService().anonymize(text, operator)
30
+
31
  def download_masked_file(masked_text, file_extension):
32
 
33
  # Create a temporary file to store the masked text
 
78
 
79
  st_operator = st.sidebar.selectbox(
80
  "De-identification approach",
81
+ ["redact", "replace", "encrypt", "hash", "mask"],
82
  index=1,
83
  help="""
84
  Select which manipulation to the text is requested after PII has been identified.\n
85
  - Redact: Completely remove the PII text\n
86
  - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
 
87
  - Highlight: Shows the original text with PII highlighted in colors\n
88
  - Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
89
  - Hash: Replaces with the hash of the PII string\n
90
  - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
91
  """,
92
  )
93
+
94
+ # st_model = st.sidebar.selectbox(
95
+ # "NER model package",
96
+ # [
97
+ # "spaCy/en_core_web_lg",
98
+ # "flair/ner-english-large",
99
+ # "HuggingFace/obi/deid_roberta_i2b2",
100
+ # "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
101
+ # ],
102
+ # index=2,
103
+ # )
104
+
105
  masked_text_public = ''
106
  if upload_option == 'Text Input':
107
  input_text = st.text_area("Enter text here:")
108
  if st.button('Analyze'):
109
  with st.spinner('Wait for it... the model is loading'):
110
+ # cached_predict_ner_tags(input_text)
111
+ masked_text = anonymize(input_text, st_operator)
112
+ # masked_text = cached_anonimize_text(input_text, st_operator)
113
  st.text_area("Masked text:", value=masked_text, height=200)
114
  elif upload_option == 'File Upload':
115
  uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
 
120
  extracted_text = extract_text_from_pdf(uploaded_file)
121
  if st.button('Analyze'):
122
  with st.spinner('Wait for it... the model is loading'):
123
+ # cached_predict_ner_tags(extracted_text)
124
+ masked_text = anonymize(extracted_text, st_operator)
125
+ # masked_text = cached_analyze_text(extracted_text)
126
  st.text_area("Masked text:", value=masked_text, height=200) # Display the extracted text
127
  if extracted_text:
128
  pdf = create_pdf(masked_text)
 
143
  text += paragraph.text
144
  if st.button('Analyze'):
145
  with st.spinner('Wait for it... the model is loading'):
146
+ # cached_predict_ner_tags(text)
147
+ masked_text = anonymize(text, st_operator)
148
+ # masked_text = cached_analyze_text(text)
149
  st.text_area("Masked text:", value=masked_text, height=200)
150
  #create word file
151
  doc_io = create_word_file(masked_text)
 
154
  else:
155
  if st.button('Analyze'):
156
  with st.spinner('Wait for it... the model is loading'):
157
+ # cached_predict_ner_tags(file_contents.decode())
158
+ # masked_text = cached_analyze_text(file_contents.decode())
159
+ masked_text = anonymize(file_contents.decode(), st_operator)
160
  st.text_area("Masked text:", value=masked_text, height=200)
161
  st.download_button(label="Download",data = masked_text,file_name="masked_text.txt")
162
 
flair_recognizer.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, List, Tuple, Set
3
+
4
+ from presidio_analyzer import (
5
+ RecognizerResult,
6
+ EntityRecognizer,
7
+ AnalysisExplanation,
8
+ )
9
+ from presidio_analyzer.nlp_engine import NlpArtifacts
10
+
11
+ from flair.data import Sentence
12
+ from flair.models import SequenceTagger
13
+
14
+
15
+ logger = logging.getLogger("presidio-analyzer")
16
+
17
+
18
class FlairRecognizer(EntityRecognizer):
    """
    Wrapper for a Flair model, to be used within Presidio Analyzer.

    :example:
    >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
    >flair_recognizer = FlairRecognizer()
    >registry = RecognizerRegistry()
    >registry.add_recognizer(flair_recognizer)
    >analyzer = AnalyzerEngine(registry=registry)
    >results = analyzer.analyze(
    >    "My name is Christopher and I live in Irbid.",
    >    language="en",
    >    return_decision_process=True,
    >)
    >for result in results:
    >    print(result)
    >    print(result.analysis_explanation)
    """

    ENTITIES = [
        "LOCATION",
        "PERSON",
        "ORGANIZATION",
        # "MISCELLANEOUS" # - There is no direct correlation with Presidio entities.
    ]

    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"

    # (presidio entities, flair labels) pairs used to match a requested
    # Presidio entity against the label Flair assigned to a span.
    CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"ORGANIZATION"}, {"ORG"}),
        # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
    ]

    MODEL_LANGUAGES = {"en": "flair/ner-english-large"}

    PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        # 'MISC': 'MISCELLANEOUS' # - Probably not PII
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        check_label_groups: Optional[List[Tuple[Set, Set]]] = None,
        model: SequenceTagger = None,
        model_path: Optional[str] = None,
    ):
        """
        :param supported_language: Language this recognizer handles.
        :param supported_entities: Presidio entities to emit (default ENTITIES).
        :param check_label_groups: Entity/label match groups
            (default CHECK_LABEL_GROUPS).
        :param model: A pre-loaded SequenceTagger (mutually exclusive
            with model_path).
        :param model_path: Path/name of a Flair model to load.
        """
        self.check_label_groups = (
            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
        )

        supported_entities = supported_entities if supported_entities else self.ENTITIES

        # Exactly one source for the model: an instance, an explicit path,
        # or the per-language default.
        if model and model_path:
            raise ValueError("Only one of model or model_path should be provided.")
        elif model and not model_path:
            self.model = model
        elif not model and model_path:
            print(f"Loading model from {model_path}")
            self.model = SequenceTagger.load(model_path)
        else:
            print(f"Loading model for language {supported_language}")
            self.model = SequenceTagger.load(
                self.MODEL_LANGUAGES.get(supported_language)
            )

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Flair Analytics",
        )

    def load(self) -> None:
        """Load the model, not used. Model is loaded during initialization."""
        pass

    def get_supported_entities(self) -> List[str]:
        """
        Return supported entities by this model.

        :return: List of the supported entities.
        """
        return self.supported_entities

    # Class to use Flair with Presidio as an external recognizer.
    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
    ) -> List[RecognizerResult]:
        """
        Analyze text using the Flair NER model.

        :param text: The text for analysis.
        :param entities: Presidio entities to look for; empty/None means all
            supported entities.
        :param nlp_artifacts: Not used by this recognizer.
        :return: The list of Presidio RecognizerResult constructed from the
            recognized Flair detections.
        """
        results = []

        # A single Sentence holding the whole input; Flair tags it in place.
        sentence = Sentence(text)
        self.model.predict(sentence)

        # If there is no specific list of entities, look for all of them.
        if not entities:
            entities = self.supported_entities

        # Entity-outer iteration preserves the output ordering callers see
        # (results grouped by requested entity).
        for entity in entities:
            if entity not in self.supported_entities:
                continue

            for span in sentence.get_spans("ner"):
                # Hoist the repeated label lookup.
                flair_label = span.labels[0].value
                if not self.__check_label(entity, flair_label, self.check_label_groups):
                    continue
                explanation = self.build_flair_explanation(
                    round(span.score, 2),
                    self.DEFAULT_EXPLANATION.format(flair_label),
                )
                results.append(self._convert_to_recognizer_result(span, explanation))

        return results

    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
        """Map one Flair span + explanation to a Presidio RecognizerResult."""
        # Fall back to the raw Flair tag when no Presidio equivalent exists.
        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
        flair_score = round(entity.score, 2)

        return RecognizerResult(
            entity_type=entity_type,
            start=entity.start_position,
            end=entity.end_position,
            score=flair_score,
            analysis_explanation=explanation,
        )

    def build_flair_explanation(
        self, original_score: float, explanation: str
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :return: AnalysisExplanation carrying the score and text.
        """
        return AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: List[Tuple[Set, Set]]
    ) -> bool:
        """True if *label* (Flair) matches *entity* (Presidio) in any group."""
        # Generator form: short-circuits without building a throwaway list.
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )