arogeriogel committed on
Commit
dd87ecd
1 Parent(s): 1decf14

adding metadata and allowed lists

Browse files
Files changed (1) hide show
  1. app.py +168 -98
app.py CHANGED
@@ -1,35 +1,33 @@
1
  import spacy
2
  import streamlit as st
3
- from flair.data import Sentence
4
- from flair.models import SequenceTagger
5
  import re
6
  import logging
7
- from presidio_analyzer.nlp_engine import NlpEngineProvider
8
  from presidio_anonymizer import AnonymizerEngine
9
  from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
10
  from annotated_text import annotated_text
11
  from flair_recognizer import FlairRecognizer
12
 
13
- # Render Streamlit page
 
 
 
 
14
  st.title("Anonymise your text!")
15
  st.markdown(
16
  "This mini-app anonymises text using Flair. You can find the code on [GitHub(WIP)](#)"
17
  )
 
18
  # Configure logger
19
  logging.basicConfig(format="\n%(asctime)s\n%(message)s", level=logging.INFO, force=True)
20
 
21
- @st.cache(suppress_st_warning=True, allow_output_mutation=True, show_spinner=False)
22
- def load_tagger():
23
- return SequenceTagger.load("flair/ner-english-large")
 
24
 
25
  @st.cache(allow_output_mutation=True,show_spinner=False)
26
  def analyzer_engine():
27
  """Return AnalyzerEngine."""
28
- # registry = RecognizerRegistry()
29
- # flair_recognizer = FlairRecognizer()
30
- # registry.load_predefined_recognizers()
31
- # registry.add_recognizer(flair_recognizer)
32
- # analyzer = AnalyzerEngine(registry=registry, supported_languages=["en"])
33
  analyzer = AnalyzerEngine()
34
  flair_recognizer = FlairRecognizer()
35
  analyzer.registry.add_recognizer(flair_recognizer)
@@ -42,143 +40,215 @@ def analyze(**kwargs):
42
  kwargs["entities"] = None
43
  return analyzer_engine().analyze(**kwargs)
44
 
45
- def annotate(text, analyze_results,st_entities):
 
 
46
  tokens = []
 
47
  # sort by start index
48
  results = sorted(analyze_results, key=lambda x: x.start)
49
  for i, res in enumerate(results):
50
- if i == 0:
51
- tokens.append(text[:res.start])
52
-
53
- # append entity text and entity type
54
- tokens.append((text[res.start: res.end], res.entity_type))
55
-
56
- # if another entity coming i.e. we're not at the last results element, add text up to next entity
57
- if i != len(results) - 1:
58
- tokens.append(text[res.end:results[i+1].start])
59
- # if no more entities coming, add all remaining text
60
- else:
61
- tokens.append(text[res.end:])
 
 
 
 
 
62
  return tokens
63
 
64
  def get_supported_entities():
65
  """Return supported entities from the Analyzer Engine."""
66
  return analyzer_engine().get_supported_entities()
67
 
68
- st_entities = st.sidebar.multiselect(
69
- label="Which entities to look for?",
70
- options=get_supported_entities(),
71
- default=list(get_supported_entities()),
72
- )
73
-
74
- def analyze_text(text: str, st_entities: str):
75
- if not text:
76
  st.session_state.text_error = "Please enter your text"
77
  return
78
-
79
  with text_spinner_placeholder:
80
  with st.spinner("Please wait while your text is being analysed..."):
81
- logging.info(f"This is the text being analysed: {text}")
 
 
82
  analyze_results = analyze(
83
- text=text,
84
  entities=st_entities,
85
  language="en",
86
  return_decision_process=False,
87
  )
88
- st.session_state.annotated_tokens = annotate(text, analyze_results,st_entities)
 
 
 
 
 
 
 
89
 
90
- # st.session_state.text_analys=annotated_text(*annotated_tokens)
91
  logging.info(
92
- f"text: {text}{metadata}{white_listed_words}\n"
93
- f"tokens: {st.session_state.annotated_tokens}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- def anonymise_text(text: str, metadata: str = "", white_listed_words: str = ""):
97
- """anonymise text"""
98
  if st.session_state.n_requests >= 50:
99
  st.session_state.text_error = "Too many requests. Please wait a few seconds before anonymising more text."
100
  logging.info(f"Session request limit reached: {st.session_state.n_requests}")
101
  st.session_state.n_requests = 1
102
- return
103
 
104
- st.session_state.text = ""
105
  st.session_state.text_error = ""
106
 
107
- if not text:
108
  st.session_state.text_error = "Please enter your text"
109
  return
110
-
 
 
 
111
  with text_spinner_placeholder:
112
  with st.spinner("Please wait while your text is being anonymised..."):
113
-
114
- # flagged = openai.moderate(prompt)
115
- # if flagged:
116
- # st.session_state.text_error = "Input flagged as inappropriate."
117
- # logging.info(f"Topic: {topic}{mood_output}{style_output}\n")
118
- # return
119
-
120
- # else:
121
- # load tagger
122
- tagger = load_tagger()
123
- # tagger = load_tagger()
124
- sentence = Sentence(text)
125
- # predict NER tags
126
- tagger.predict(sentence)
127
- # iterate over entities and redact
128
- enitities=[e.text for e in sentence.get_spans('ner')]
129
- regex = re.compile('|'.join(map(re.escape, enitities)))
130
- text_anon = regex.sub("<PID>", text)
131
-
132
  st.session_state.text_error = ""
133
  st.session_state.n_requests += 1
134
- st.session_state.text_anon = text_anon
135
  logging.info(
136
- f"text: {text}{metadata}{white_listed_words}\n"
137
- f"entities: {sentence.get_spans('ner')}\n"
138
- f"text anonymised: {st.session_state.text_anon}"
139
  )
140
 
141
- if "text" not in st.session_state:
142
- st.session_state.text = ""
 
 
 
 
 
 
143
  if "text_error" not in st.session_state:
144
  st.session_state.text_error = ""
145
- if "annotated_tokens" not in st.session_state:
146
- st.session_state.annotated_tokens = ""
147
- if "text_anon" not in st.session_state:
148
- st.session_state.text_anon = ""
149
  if "n_requests" not in st.session_state:
150
  st.session_state.n_requests = 0
151
 
152
- text = st.text_input(label="Text to be anonymised", placeholder="Write your text here")
153
- metadata = st.text_input(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  label="Data to be redacted (optional)",
155
- placeholder="inspirational",
 
 
156
  )
157
- white_listed_words = st.text_input(
158
  label="Data to be ignored (optional)",
159
- placeholder="inspirational",
 
 
160
  )
161
 
162
- # button return true when clicked
163
- analyze_now = st.button(
164
- label="Analyse text",
165
- type="primary",
166
- on_click=analyze_text,
167
- args=(text,st_entities,),
168
  )
 
 
 
 
 
169
  # button return true when clicked
170
- anonymise_now = st.button(
171
- label="Anonymise text",
172
- type="primary",
173
- on_click=anonymise_text,
174
- args=(text, metadata, white_listed_words),
175
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  text_spinner_placeholder = st.empty()
177
  if st.session_state.text_error:
178
  st.error(st.session_state.text_error)
179
- if analyze_now:
180
- # annotated_tokens
181
- annotated_text(*st.session_state.annotated_tokens)
182
- if st.session_state.text_anon:
183
- st.markdown("""---""")
184
- st.text_area(label="Text anonymised", value=st.session_state.text_anon, height=100)
 
 
 
 
1
  import spacy
2
  import streamlit as st
 
 
3
  import re
4
  import logging
 
5
  from presidio_anonymizer import AnonymizerEngine
6
  from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
7
  from annotated_text import annotated_text
8
  from flair_recognizer import FlairRecognizer
9
 
10
+
11
+ ###############################
12
+ #### Render Streamlit page ####
13
+ ###############################
14
+
15
# Page header: the app title followed by a one-line description. The GitHub
# link target is still a placeholder ("#").
st.title("Anonymise your text!")
st.markdown(
    "This mini-app anonymises text using Flair. You can find the code on [GitHub(WIP)](#)"
)

# Configure logger: INFO level, each record printed as a timestamp line
# followed by the message line. force=True replaces any handlers that are
# already attached to the root logger.
logging.basicConfig(
    level=logging.INFO,
    format="\n%(asctime)s\n%(message)s",
    force=True,
)
22
 
23
+
24
+ ##############################
25
+ ###### Define functions ######
26
+ ##############################
27
 
28
  @st.cache(allow_output_mutation=True,show_spinner=False)
29
  def analyzer_engine():
30
  """Return AnalyzerEngine."""
 
 
 
 
 
31
  analyzer = AnalyzerEngine()
32
  flair_recognizer = FlairRecognizer()
33
  analyzer.registry.add_recognizer(flair_recognizer)
 
40
  kwargs["entities"] = None
41
  return analyzer_engine().analyze(**kwargs)
42
 
43
def annotate():
    """Split the session text into plain and entity tokens for display.

    Reads ``st.session_state.text`` and ``st.session_state.analyze_results``
    and returns a list that alternates plain-text strings with
    ``(entity_text, entity_type)`` tuples, in document order. The list is
    meant to be unpacked into ``annotated_text``.

    Results are visited in order of their start offset. A result whose
    start lies before the end of the previously emitted entity (a duplicate
    detection of the same token, or an overlapping span) is skipped, so
    every character of the text appears in the output exactly once.

    Returns:
        list: plain ``str`` segments and ``(str, str)`` entity tuples. When
        there are no results the list holds the whole text as one segment.
    """
    text = st.session_state.text
    analyze_results = st.session_state.analyze_results

    tokens = []
    # Offset of the first character that has not been emitted yet.
    cursor = 0
    # sorted() is stable, so among results sharing a start offset the one
    # reported first is the one that gets displayed.
    for res in sorted(analyze_results, key=lambda r: r.start):
        if res.start < cursor:
            # Already covered by the previous entity: emitting it again
            # would duplicate text, and must not swallow the gap after it.
            continue
        # Plain text between the previous entity (or the start) and this one.
        tokens.append(text[cursor:res.start])
        # Entity text together with its entity type.
        tokens.append((text[res.start:res.end], res.entity_type))
        cursor = res.end

    # Whatever follows the last entity.
    tokens.append(text[cursor:])
    return tokens
69
 
70
def get_supported_entities():
    """Return the entity names the shared analyzer engine can detect."""
    engine = analyzer_engine()
    return engine.get_supported_entities()
73
 
74
def analyze_text():
    """Analyse the text in session state and store the detected entities.

    Button callback. If no text has been entered it records an error message
    in ``st.session_state.text_error`` and stops. Otherwise it clears the
    error, bumps ``st.session_state.n_requests``, runs ``analyze`` for the
    entities selected in the sidebar, drops results matching the allowed
    words (when any were given) and saves the outcome in
    ``st.session_state.analyze_results``.
    """
    text = st.session_state.text
    if not text:
        st.session_state.text_error = "Please enter your text"
        return

    # Show the spinner inside the placeholder while the analysis runs.
    with text_spinner_placeholder, st.spinner("Please wait while your text is being analysed..."):
        logging.info(f"This is the text being analysed: {text}")
        st.session_state.text_error = ""
        st.session_state.n_requests += 1

        findings = analyze(
            text=text,
            entities=st_entities,
            language="en",
            return_decision_process=False,
        )

        # TODO: the "Data to be redacted" input (st.session_state.metadata)
        # is not applied yet; include_manual_input is still a disabled draft.

        if st.session_state.allowed_words:
            findings = exclude_manual_input(findings)

        st.session_state.analyze_results = findings

        logging.info(
            f"analyse results: {st.session_state.analyze_results}\n"
        )
102
+
103
+
104
+ # def include_manual_input(analyze_results):
105
+ # analyze_results_extended=[]
106
+ # logging.info(
107
+ # f"analyse results before adding extra words: {analyze_results}\n"
108
+ # )
109
+ # for word in st.session_state.text.split():
110
+ # if word in st.session_state.metadata:
111
+ # [m.start() for m in re.finditer('test', 'test test test test')]
112
+ # analyze_results_extended.append("type: MANUAL, start: 0, end: 3, score: 1.0")
113
+ # logging.info(
114
+ # f"analyse results after adding allowed words: {analyze_results_extended}\n"
115
+ # )
116
+ # return analyze_results
117
+
118
def exclude_manual_input(analyze_results):
    """Drop results whose matched text is one of the user's allowed words.

    ``st.session_state.allowed_words`` is the raw content of the
    "Data to be ignored" input, e.g. ``"NHS, GEL, Lab"``. It is split on
    commas and each entry is stripped of surrounding whitespace; a result is
    removed only when the text it covers equals one of those entries
    (case-sensitive). Comparing against whole entries, rather than testing
    membership in the raw string, keeps a short entity such as ``"N"`` from
    being dropped just because it is a substring of ``"NHS"``.

    Args:
        analyze_results: iterable of results exposing ``start`` and ``end``
            offsets into ``st.session_state.text``.

    Returns:
        list: the results that are not allowed words, in their original order.
    """
    logging.info(
        f"analyse results before removing allowed words: {analyze_results}\n"
    )

    # Parse the comma-separated input once; a set gives exact-match lookups.
    allowed = {
        word.strip()
        for word in st.session_state.allowed_words.split(",")
        if word.strip()
    }
    text = st.session_state.text
    analyze_results_filtered = [
        token
        for token in analyze_results
        if text[token.start:token.end] not in allowed
    ]

    logging.info(
        f"analyse results after removing allowed words: {analyze_results_filtered}\n"
    )
    return analyze_results_filtered
130
+
131
+
132
@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    """Create the AnonymizerEngine; st.cache reuses it on later calls."""
    engine = AnonymizerEngine()
    return engine
136
 
137
def anonymise_text():
    """Anonymise the text in session state and store the result.

    Button callback. Steps:

    1. If the session has reached 50 requests, record a "too many requests"
       error, reset the counter to 1 and stop without anonymising.
    2. If no text has been entered, record an error and stop.
    3. If there are no stored analysis results, run ``analyze_text`` first.
    4. Pass the text and the stored results to the anonymizer engine, save
       the outcome in ``st.session_state.anon_results`` and bump
       ``st.session_state.n_requests``.
    """
    if st.session_state.n_requests >= 50:
        st.session_state.text_error = "Too many requests. Please wait a few seconds before anonymising more text."
        logging.info(f"Session request limit reached: {st.session_state.n_requests}")
        st.session_state.n_requests = 1
        # Stop here. Without this return the error set above is wiped by the
        # reset just below and the request is processed anyway, so the limit
        # would never take effect.
        return

    st.session_state.text_error = ""

    if not st.session_state.text:
        st.session_state.text_error = "Please enter your text"
        return

    # Anonymisation works on analysis results; produce them if missing.
    if not st.session_state.analyze_results:
        analyze_text()

    with text_spinner_placeholder:
        with st.spinner("Please wait while your text is being anonymised..."):
            anon_results = anonymizer_engine().anonymize(
                st.session_state.text,
                st.session_state.analyze_results,
            )
            st.session_state.text_error = ""
            st.session_state.n_requests += 1
            st.session_state.anon_results = anon_results
            logging.info(
                f"text anonymised: {st.session_state.anon_results}"
            )
161
 
162
def clear_results():
    """Reset the stored anonymisation and analysis results to empty strings.

    Registered as the on_change callback of the text inputs.
    """
    st.session_state.anon_results = st.session_state.analyze_results = ""
165
+
166
+ ##############################
167
+ #### Initialize variables ####
168
+ ##############################
169
+
170
# Seed session state on the first run only; keys that already exist keep
# their current value across reruns.
for _key, _initial in (
    ("text_error", ""),
    ("analyze_results", ""),
    ("anon_results", ""),
    ("n_requests", 0),
):
    if _key not in st.session_state:
        st.session_state[_key] = _initial
178
 
179
+ ##############################
180
+ ####### Page arguments #######
181
+ ##############################
182
+
183
+ # Every widget with a key is automatically added to Session State
184
+
185
+ # In Streamlit, interacting with a widget triggers a rerun and variables defined
186
+ # in the code get reinitialized after each rerun.
187
+
188
+ # If a callback function is associated with a widget then a change in the widget
189
+ # triggers the following sequence: First the callback function is executed and then
190
+ # the app executes from top to bottom.
191
+
192
# Free-text inputs. Each widget stores its value in session state under its
# key, and editing any of them clears previously stored results.
st.text_input(
    key='text',
    label="Text",
    placeholder="Write your text here",
    on_change=clear_results,
)
st.text_input(
    key='metadata',
    label="Data to be redacted (optional)",
    placeholder="John, Mary, London",
    on_change=clear_results,
)
st.text_input(
    key='allowed_words',
    label="Data to be ignored (optional)",
    placeholder="NHS, GEL, Lab",
    on_change=clear_results,
)

# Sidebar selector for the entity types to search for; every supported
# entity is offered and all of them are selected by default.
_supported_entities = get_supported_entities()
st_entities = st.sidebar.multiselect(
    label="Which entities to look for?",
    options=_supported_entities,
    default=list(_supported_entities),
)
216
+
217
+ ##############################
218
+ ######## Page buttons ########
219
+ ##############################
220
+
221
  # button return true when clicked
222
+
223
# Two side-by-side columns: analysis on the left, anonymisation on the right.
col1, col2 = st.columns(2)

# Each button returns True on the run triggered by its click; the on_click
# callback does the actual work.
analyze_now = col1.button(
    label="Analyse text",
    type="primary",
    on_click=analyze_text,
)

anonymise_now = col2.button(
    label="Anonymise text",
    type="primary",
    on_click=anonymise_text,
)
238
+
239
+ ##############################
240
+ ######## Page actions ########
241
+ ##############################
242
+
243
# Slot that the callbacks use to display their spinner.
text_spinner_placeholder = st.empty()

# Show the most recent error message, if there is one.
_error_message = st.session_state.text_error
if _error_message:
    st.error(_error_message)

# Left column: highlighted text plus the raw analysis results.
with col1:
    _results = st.session_state.analyze_results
    if _results:
        annotated_text(*annotate())
        st.write(_results)

# Right column: the anonymised text.
with col2:
    _anonymised = st.session_state.anon_results
    if _anonymised:
        st.write(_anonymised.text)