arogeriogel committed on
Commit
1decf14
1 Parent(s): 92be4b9

adding presidio

Browse files
Files changed (3) hide show
  1. app.py +95 -6
  2. flair_recognizer.py +219 -0
  3. requirements.txt +6 -1
app.py CHANGED
@@ -1,21 +1,98 @@
 
1
  import streamlit as st
2
  from flair.data import Sentence
3
  from flair.models import SequenceTagger
4
  import re
5
  import logging
 
 
 
 
 
6
 
7
  # Render Streamlit page
8
  st.title("Anonymise your text!")
9
  st.markdown(
10
- "This mini-app anonymises text using Bert. You can find the code on [GitHub(WIP)](#)"
11
  )
12
  # Configure logger
13
  logging.basicConfig(format="\n%(asctime)s\n%(message)s", level=logging.INFO, force=True)
14
 
15
- @st.cache(suppress_st_warning=True)
16
  def load_tagger():
17
  return SequenceTagger.load("flair/ner-english-large")
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def anonymise_text(text: str, metadata: str = "", white_listed_words: str = ""):
20
  """anonymise text"""
21
  if st.session_state.n_requests >= 50:
@@ -42,7 +119,8 @@ def anonymise_text(text: str, metadata: str = "", white_listed_words: str = ""):
42
 
43
  # else:
44
  # load tagger
45
- tagger = load_tagger()
 
46
  sentence = Sentence(text)
47
  # predict NER tags
48
  tagger.predict(sentence)
@@ -56,15 +134,16 @@ def anonymise_text(text: str, metadata: str = "", white_listed_words: str = ""):
56
  st.session_state.text_anon = text_anon
57
  logging.info(
58
  f"text: {text}{metadata}{white_listed_words}\n"
 
59
  f"text anonymised: {st.session_state.text_anon}"
60
  )
61
- # def anonymise_text(text: str, metadata: str = "", white_listed_words: str = ""):
62
- # st.session_state.text_anon = "this is anonymised"
63
 
64
  if "text" not in st.session_state:
65
  st.session_state.text = ""
66
  if "text_error" not in st.session_state:
67
  st.session_state.text_error = ""
 
 
68
  if "text_anon" not in st.session_state:
69
  st.session_state.text_anon = ""
70
  if "n_requests" not in st.session_state:
@@ -79,6 +158,14 @@ white_listed_words = st.text_input(
79
  label="Data to be ignored (optional)",
80
  placeholder="inspirational",
81
  )
 
 
 
 
 
 
 
 
82
  # button return true when clicked
83
  anonymise_now = st.button(
84
  label="Anonymise text",
@@ -89,7 +176,9 @@ anonymise_now = st.button(
89
  text_spinner_placeholder = st.empty()
90
  if st.session_state.text_error:
91
  st.error(st.session_state.text_error)
92
-
 
 
93
  if st.session_state.text_anon:
94
  st.markdown("""---""")
95
  st.text_area(label="Text anonymised", value=st.session_state.text_anon, height=100)
 
1
+ import spacy
2
  import streamlit as st
3
  from flair.data import Sentence
4
  from flair.models import SequenceTagger
5
  import re
6
  import logging
7
+ from presidio_analyzer.nlp_engine import NlpEngineProvider
8
+ from presidio_anonymizer import AnonymizerEngine
9
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
10
+ from annotated_text import annotated_text
11
+ from flair_recognizer import FlairRecognizer
12
 
13
  # Render Streamlit page
14
  st.title("Anonymise your text!")
15
  st.markdown(
16
+ "This mini-app anonymises text using Flair. You can find the code on [GitHub(WIP)](#)"
17
  )
18
  # Configure logger
19
  logging.basicConfig(format="\n%(asctime)s\n%(message)s", level=logging.INFO, force=True)
20
 
21
@st.cache(suppress_st_warning=True, allow_output_mutation=True, show_spinner=False)
def load_tagger():
    """Load and cache the large English Flair NER tagger for this session."""
    tagger = SequenceTagger.load("flair/ner-english-large")
    return tagger
24
 
25
@st.cache(allow_output_mutation=True, show_spinner=False)
def analyzer_engine():
    """Return a cached Presidio AnalyzerEngine with the Flair recognizer added.

    Built once per session via st.cache: the default AnalyzerEngine (with its
    predefined recognizer registry) is created, then a FlairRecognizer — which
    loads a large NER model — is appended to its registry.

    :return: The configured AnalyzerEngine instance.
    """
    analyzer = AnalyzerEngine()
    # FlairRecognizer() loads the flair NER model on construction, which is
    # why this whole function is cached.
    flair_recognizer = FlairRecognizer()
    analyzer.registry.add_recognizer(flair_recognizer)

    return analyzer
38
+
39
def analyze(**kwargs):
    """Analyze input using Analyzer engine and input arguments (kwargs).

    When no entity filter is supplied, or the filter includes "All", the
    ``entities`` argument is reset to ``None`` so Presidio scans for every
    supported entity type.
    """
    scan_everything = "entities" not in kwargs or "All" in kwargs["entities"]
    if scan_everything:
        kwargs["entities"] = None
    return analyzer_engine().analyze(**kwargs)
44
+
45
def annotate(text, analyze_results, st_entities):
    """Split *text* into tokens for the ``annotated_text`` component.

    Plain text is emitted as bare strings; each detected entity becomes a
    ``(entity_text, entity_type)`` tuple.

    :param text: The original analysed text.
    :param analyze_results: Presidio ``RecognizerResult``-like objects exposing
        ``start``, ``end`` and ``entity_type``. Assumed non-overlapping and
        within bounds of *text*.
    :param st_entities: Unused; kept for call-site compatibility.
    :return: List of strings and ``(text, entity_type)`` tuples.
    """
    # Sort by start index so the text can be walked left to right, emitting
    # the plain-text gaps between consecutive entities.
    results = sorted(analyze_results, key=lambda x: x.start)

    # Bug fix: the original returned an empty list when no entities were
    # detected, which silently dropped the whole text from the UI.
    if not results:
        return [text]

    tokens = []
    for i, res in enumerate(results):
        if i == 0:
            # Text before the first entity (may be the empty string).
            tokens.append(text[:res.start])

        # The entity itself, tagged with its type.
        tokens.append((text[res.start:res.end], res.entity_type))

        if i != len(results) - 1:
            # Text between this entity and the next one.
            tokens.append(text[res.end:results[i + 1].start])
        else:
            # Trailing text after the last entity.
            tokens.append(text[res.end:])
    return tokens
63
+
64
def get_supported_entities():
    """List every entity type the cached Analyzer Engine can detect."""
    engine = analyzer_engine()
    return engine.get_supported_entities()
67
+
68
# Sidebar control: which entity types to report. All entities supported by
# the analyzer engine are offered, and all are pre-selected by default.
st_entities = st.sidebar.multiselect(
    label="Which entities to look for?",
    options=get_supported_entities(),
    default=list(get_supported_entities()),
)
73
+
74
def analyze_text(text: str, st_entities: str):
    """Analyse *text* with Presidio and stash annotated tokens in session state.

    Sets ``st.session_state.text_error`` when the input is empty; otherwise
    stores the token list in ``st.session_state.annotated_tokens`` for
    rendering after the rerun.

    NOTE(review): despite the ``str`` annotation, ``st_entities`` appears to be
    the list of entity names from the sidebar multiselect — confirm and fix
    the hint at the call sites.
    """
    if not text:
        st.session_state.text_error = "Please enter your text"
        return

    # `text_spinner_placeholder` is a module-level st.empty() defined later in
    # the script; the spinner renders inside it while analysis runs.
    with text_spinner_placeholder:
        with st.spinner("Please wait while your text is being analysed..."):
            logging.info(f"This is the text being analysed: {text}")
            analyze_results = analyze(
                text=text,
                entities=st_entities,
                language="en",
                return_decision_process=False,
            )
            # Tokens alternate plain strings and (text, entity_type) tuples
            # for the annotated_text component.
            st.session_state.annotated_tokens = annotate(text, analyze_results, st_entities)

            # st.session_state.text_analys=annotated_text(*annotated_tokens)
            # NOTE(review): `metadata` and `white_listed_words` are module-level
            # inputs defined elsewhere in the script.
            logging.info(
                f"text: {text}{metadata}{white_listed_words}\n"
                f"tokens: {st.session_state.annotated_tokens}\n"
            )
95
+
96
  def anonymise_text(text: str, metadata: str = "", white_listed_words: str = ""):
97
  """anonymise text"""
98
  if st.session_state.n_requests >= 50:
 
119
 
120
  # else:
121
  # load tagger
122
+ tagger = load_tagger()
123
+ # tagger = load_tagger()
124
  sentence = Sentence(text)
125
  # predict NER tags
126
  tagger.predict(sentence)
 
134
  st.session_state.text_anon = text_anon
135
  logging.info(
136
  f"text: {text}{metadata}{white_listed_words}\n"
137
+ f"entities: {sentence.get_spans('ner')}\n"
138
  f"text anonymised: {st.session_state.text_anon}"
139
  )
 
 
140
 
141
  if "text" not in st.session_state:
142
  st.session_state.text = ""
143
  if "text_error" not in st.session_state:
144
  st.session_state.text_error = ""
145
+ if "annotated_tokens" not in st.session_state:
146
+ st.session_state.annotated_tokens = ""
147
  if "text_anon" not in st.session_state:
148
  st.session_state.text_anon = ""
149
  if "n_requests" not in st.session_state:
 
158
  label="Data to be ignored (optional)",
159
  placeholder="inspirational",
160
  )
161
+
162
+ # button return true when clicked
163
+ analyze_now = st.button(
164
+ label="Analyse text",
165
+ type="primary",
166
+ on_click=analyze_text,
167
+ args=(text,st_entities,),
168
+ )
169
  # button return true when clicked
170
  anonymise_now = st.button(
171
  label="Anonymise text",
 
176
  text_spinner_placeholder = st.empty()
177
  if st.session_state.text_error:
178
  st.error(st.session_state.text_error)
179
if analyze_now:
    # Render the analysis result. annotated_tokens is initialised to "" in
    # session state, which unpacks to zero arguments before any analysis ran.
    annotated_text(*st.session_state.annotated_tokens)
182
  if st.session_state.text_anon:
183
  st.markdown("""---""")
184
  st.text_area(label="Text anonymised", value=st.session_state.text_anon, height=100)
flair_recognizer.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, List, Tuple, Set
3
+
4
+ from presidio_analyzer import (
5
+ RecognizerResult,
6
+ EntityRecognizer,
7
+ AnalysisExplanation,
8
+ )
9
+ from presidio_analyzer.nlp_engine import NlpArtifacts
10
+
11
+ try:
12
+ from flair.data import Sentence
13
+ from flair.models import SequenceTagger
14
+ except ImportError:
15
+ print("Flair is not installed")
16
+
17
+
18
+ logger = logging.getLogger("presidio-analyzer")
19
+
20
+
21
class FlairRecognizer(EntityRecognizer):
    """
    Wrapper for a flair model, if needed to be used within Presidio Analyzer.

    :example:
    >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

    >flair_recognizer = FlairRecognizer()

    >registry = RecognizerRegistry()
    >registry.add_recognizer(flair_recognizer)

    >analyzer = AnalyzerEngine(registry=registry)

    >results = analyzer.analyze(
    >    "My name is Christopher and I live in Irbid.",
    >    language="en",
    >    return_decision_process=True,
    >)
    >for result in results:
    >    print(result)
    >    print(result.analysis_explanation)


    """

    # Presidio entity types this recognizer can emit.
    ENTITIES = [
        "LOCATION",
        "PERSON",
        "ORGANIZATION",
        # "MISCELLANEOUS" # - There are no direct correlation with Presidio entities.
    ]

    # Template for the human-readable analysis explanation.
    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"

    # Pairs of ({presidio entities}, {flair labels}): a Flair span label maps
    # to a requested Presidio entity when both appear in the same pair.
    CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"ORGANIZATION"}, {"ORG"}),
        # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
    ]

    # Language code -> Flair model name used when no model is injected.
    MODEL_LANGUAGES = {
        "en": "flair/ner-english-large",
        "es": "flair/ner-spanish-large",
        "de": "flair/ner-german-large",
        "nl": "flair/ner-dutch-large",
    }

    # Flair tag -> Presidio entity type.
    PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        # 'MISC': 'MISCELLANEOUS' # - Probably not PII
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        check_label_groups: Optional[List[Tuple[Set, Set]]] = None,
        model: SequenceTagger = None,
    ):
        """Create the recognizer and eagerly load the Flair model.

        :param supported_language: Language code; must be a key of
            MODEL_LANGUAGES unless ``model`` is supplied.
        :param supported_entities: Presidio entities to emit
            (defaults to ENTITIES).
        :param check_label_groups: Entity/label match groups
            (defaults to CHECK_LABEL_GROUPS).
        :param model: Pre-loaded SequenceTagger; when omitted, the model for
            ``supported_language`` is loaded (can be a very large download).
        """
        self.check_label_groups = (
            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
        )

        supported_entities = supported_entities if supported_entities else self.ENTITIES
        self.model = (
            model
            if model
            else SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
        )

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Flair Analytics",
        )

    def load(self) -> None:
        """Load the model, not used. Model is loaded during initialization."""
        pass

    def get_supported_entities(self) -> List[str]:
        """
        Return supported entities by this model.

        :return: List of the supported entities.
        """
        return self.supported_entities

    # Class to use Flair with Presidio as an external recognizer.
    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
    ) -> List[RecognizerResult]:
        """
        Analyze text using Text Analytics.

        :param text: The text for analysis.
        :param entities: Not working properly for this recognizer.
        :param nlp_artifacts: Not used by this recognizer.
        :param language: Text language. Supported languages in MODEL_LANGUAGES
        :return: The list of Presidio RecognizerResult constructed from the recognized
            Flair detections.
        """

        results = []

        sentences = Sentence(text)
        self.model.predict(sentences)

        # If there are no specific list of entities, we will look for all of it.
        if not entities:
            entities = self.supported_entities

        for entity in entities:
            if entity not in self.supported_entities:
                continue

            for ent in sentences.get_spans("ner"):
                # Skip spans whose Flair label does not correspond to the
                # currently requested Presidio entity.
                if not self.__check_label(
                    entity, ent.labels[0].value, self.check_label_groups
                ):
                    continue
                textual_explanation = self.DEFAULT_EXPLANATION.format(
                    ent.labels[0].value
                )
                explanation = self.build_flair_explanation(
                    round(ent.score, 2), textual_explanation
                )
                flair_result = self._convert_to_recognizer_result(ent, explanation)

                results.append(flair_result)

        return results

    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
        """Map one Flair span to a Presidio RecognizerResult."""
        # Fall back to the raw Flair tag when no Presidio equivalent exists.
        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
        flair_score = round(entity.score, 2)

        flair_results = RecognizerResult(
            entity_type=entity_type,
            start=entity.start_position,
            end=entity.end_position,
            score=flair_score,
            analysis_explanation=explanation,
        )

        return flair_results

    def build_flair_explanation(
        self, original_score: float, explanation: str
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :return:
        """
        explanation = AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )
        return explanation

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: List[Tuple[Set, Set]]
    ) -> bool:
        # True when some group pairs this Presidio entity with this Flair label.
        return any(
            [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
        )
197
+
198
+
199
if __name__ == "__main__":
    # Smoke-test the recognizer end to end through a Presidio AnalyzerEngine.
    from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

    # NOTE: constructing FlairRecognizer downloads a very large (+2GB) model
    # on the first run.
    recognizer = FlairRecognizer()

    recognizer_registry = RecognizerRegistry()
    recognizer_registry.add_recognizer(recognizer)
    engine = AnalyzerEngine(registry=recognizer_registry)

    sample = "My name is Christopher and I live in Irbid."
    detections = engine.analyze(
        sample,
        language="en",
        return_decision_process=True,
    )
    for detection in detections:
        print(detection)
        print(detection.analysis_explanation)
requirements.txt CHANGED
@@ -1 +1,6 @@
1
- flair==0.11
 
 
 
 
 
 
1
+ flair==0.11
2
+ presidio-anonymizer
3
+ presidio-analyzer
4
+ st-annotated-text
5
+ spacy>=3.0.0,<4.0.0
6
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.0.0/en_core_web_lg-3.0.0.tar.gz#egg=en_core_web_lg