harshildarji committed
Commit c1d2a54 · verified · 1 Parent(s): 69f0ad7

Update app.py
Files changed (1): app.py (+162, -61)
app.py CHANGED
@@ -1,5 +1,6 @@
import re
import string

import seaborn as sns
import streamlit as st
@@ -12,10 +13,12 @@ from transformers import (
)


- # Setup & Constants
def setup_page():
    st.set_page_config(
-         page_title="Juristische Anonymisierung", page_icon="⚖️", layout="wide"
    )
    logging.set_verbosity(logging.ERROR)
    st.markdown(
@@ -115,11 +118,12 @@ def load_ner_model():


@st.cache_data(show_spinner=False)
- def ner_merge_lines(text):
    ner = load_ner_model()
    merged_lines = []
    for line in text.splitlines():
        if not line.strip():
            continue
        tokens = ner(line)
        merged = merge_entities(tokens)
@@ -134,7 +138,6 @@ def merge_entities(entities):
    merged = [ents[0].copy()]
    merged[0]["score_sum"] = ents[0]["score"]
    merged[0]["count"] = 1
-
    for ent in ents[1:]:
        prev = merged[-1]
        if ent["index"] == prev["index"] + 1:
@@ -151,11 +154,9 @@ def merge_entities(entities):
            new_ent["score_sum"] = ent["score"]
            new_ent["count"] = 1
            merged.append(new_ent)
-
    if "score_sum" in merged[-1]:
        merged[-1]["score"] = merged[-1]["score_sum"] / merged[-1]["count"]
        del merged[-1]["score_sum"], merged[-1]["count"]
-
    final = []
    for ent in merged:
        w = ent["word"].strip()
@@ -175,6 +176,34 @@ def truncate(number, decimals=2):
    return int(number * factor) / factor


def highlight_entities(
    line,
    merged_entities,
@@ -198,11 +227,13 @@ def highlight_entities(
        truncated_score = truncate(ent["score"], 2)
        tooltip = f"{label_desc} ({truncated_score:.2f})"
        color = ENTITY_COLORS.get(label, "#cccccc")
        html += line[last_end:start]

        should_anonymize = any(
            label in entity_importance[level] for level in importance_levels
        )
        if should_anonymize:
            key = (ent["word"].lower(), label)
            if key not in anonymized_map:
@@ -210,11 +241,14 @@ def highlight_entities(
                suffix = chr(ord("A") + count)
                label_counters[label] = count + 1
                anonymized_map[key] = suffix
            suffix = anonymized_map[key]
            display = f"{label_desc} {suffix}"
            normalized_word = ent["word"].strip().lower()
            display_key = f"{label_desc} {suffix} : {normalized_word}"
-             if display_key not in allowed_keys:
                display = ent["word"]
                style = ""
                css_class = "entity"
@@ -226,7 +260,7 @@ def highlight_entities(
            style = ""
            css_class = "entity"

-         html += f'<span class="{css_class}" style="{style}">{display}<span class="tooltip">{tooltip}</span></span>'
        last_end = end

    html += line[last_end:]
@@ -243,6 +277,7 @@ def main():
        st.session_state.manual_phrases = []

    st.markdown("#### Juristische Anonymisierung")
    uploaded_file = st.file_uploader(
        "Bitte laden Sie eine .txt-Datei hoch:", type="txt"
    )
@@ -263,7 +298,8 @@ def main():
        ent_list = [entity_labels[k] for k in entity_importance[level]]
        st.markdown(f"**{label}**: {', '.join(ent_list)}")

-     threshold = st.slider("Schwellenwert für das Modellvertrauen:", 0.0, 1.0, 0.8, 0.01)
    st.markdown("---")

    if uploaded_file:
@@ -277,18 +313,16 @@ def main():
        with st.spinner("Modell wird einmalig auf die Datei angewendet..."):
            merged_all_lines = ner_merge_lines(text)

        manual_phrases = st.session_state.manual_phrases
        overlap_warnings = set()
-
        for idx, (line, merged) in enumerate(merged_all_lines):
            for phrase in manual_phrases:
                for match in re.finditer(re.escape(phrase), line.lower()):
                    start, end = match.start(), match.end()
-
                    if any(start < e["end"] and end > e["start"] for e in merged):
                        overlap_warnings.add(phrase)
                        continue
-
                    merged.append(
                        {
                            "start": start,
@@ -299,75 +333,140 @@ def main():
                            "index": 9999,
                        }
                    )
-
            merged_all_lines[idx] = (line, sorted(merged, key=lambda x: x["start"]))

-         label_counters = {}
-         anonymized_map = {}
-         all_display_keys = []

        for _, merged in merged_all_lines:
            for ent in merged:
                label = ent["entity"].split("-")[-1]
                if any(label in entity_importance[lvl] for lvl in importance_levels):
-                     key = (ent["word"].lower(), label)
-                     if key not in anonymized_map:
-                         count = label_counters.get(label, 0)
-                         suffix = chr(ord("A") + count)
-                         label_counters[label] = count + 1
-                         anonymized_map[key] = suffix
-                     suffix = anonymized_map[key]
-                     normalized_word = ent["word"].strip().lower()
-                     display = f"{entity_labels.get(label, label)} {suffix} : {normalized_word}"
-                     if display not in all_display_keys:
-                         all_display_keys.append(display)
-
-         all_display_keys.sort(key=lambda tag: tag.lower())

        with st.sidebar:
            st.markdown("### Neue Phrase schwärzen:")
-
            if "manual_phrases" not in st.session_state:
                st.session_state.manual_phrases = []

            with st.form("manual_add_form"):
                new_phrase = st.text_input("Neue Phrase:")
                submitted = st.form_submit_button("Hinzufügen")
-                 with st.sidebar.expander(
-                     "Hinweise zu manuellen Phrasen", expanded=False
-                 ):
-                     st.markdown("**Noch in Entwicklung**")
-                     st.markdown(
-                         "_Manuelle Schwärzungen können fehlschlagen, wenn sich die Phrase mit bereits erkannten Entitäten überschneidet oder über mehrere Zeilen erstreckt._"
-                     )

-             if submitted and new_phrase.strip():
-                 cleaned = new_phrase.strip().lower()
-                 if cleaned not in st.session_state.manual_phrases:
-                     st.session_state.manual_phrases.append(cleaned)
-                     st.rerun()

            st.markdown("---")
            st.markdown("### Anonymisierte Entitäten verwalten:")
-             selected_keys = []
-             for label_code in sorted(
-                 set(k[1] for k in anonymized_map.keys()),
-                 key=lambda x: entity_labels.get(x, x),
-             ):
-                 group = [k for k in anonymized_map if k[1] == label_code]
-                 label_name = entity_labels[label_code]
-                 st.markdown(f"**{label_name}**")
-                 for key in sorted(group, key=lambda k: anonymized_map[k]):
-                     suffix = anonymized_map[key]
-                     normalized_word = key[0].strip().lower()
-                     entity_display = f"{label_name} {suffix} : {normalized_word}"
-                     if st.checkbox(entity_display, value=True, key=entity_display):
-                         selected_keys.append(entity_display)

        anonymized_lines = []
        for line, merged in merged_all_lines:
            if not line.strip():
-                 st.markdown("<br>", unsafe_allow_html=True)
                anonymized_lines.append("")
                continue

@@ -376,18 +475,20 @@ def main():
                merged,
                importance_levels,
                threshold,
-                 label_counters,
                anonymized_map,
-                 selected_keys,
                entity_labels,
                entity_importance,
                ENTITY_COLORS,
            )
            st.markdown(
-                 f'<div style="margin-bottom:0.8rem; line-height:1.8;">{html_line}</div>',
                unsafe_allow_html=True,
            )
-             cleaned = re.sub(r'<span class="tooltip">.*?</span>', "", html_line)
            text_only = re.sub(r"<[^>]+>", "", cleaned)
            anonymized_lines.append(text_only.strip())

 
app.py (updated version, changed sections)

import re
import string
+ from collections import defaultdict

import seaborn as sns
import streamlit as st
 
)


+ # Setup
def setup_page():
    st.set_page_config(
+         page_title="Juristische Anonymisierung",
+         page_icon="⚖️",
+         layout="wide",
    )
    logging.set_verbosity(logging.ERROR)
    st.markdown(
 


@st.cache_data(show_spinner=False)
+ def ner_merge_lines(text: str):
    ner = load_ner_model()
    merged_lines = []
    for line in text.splitlines():
        if not line.strip():
+             merged_lines.append((line, []))
            continue
        tokens = ner(line)
        merged = merge_entities(tokens)
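Note on the change above: appending (line, []) for blank lines keeps the returned list aligned one-to-one with the input lines. A minimal sketch of the expected shape, not part of the commit (entity values are illustrative, not from a real model run):

# Illustrative only: one (line, entities) pair per input line, blank lines included.
example_merged_all_lines = [
    (
        "Der Kläger Max Mustermann erschien.",
        [
            {
                "start": 11,
                "end": 25,
                "word": "Max Mustermann",
                "entity": "PER",
                "score": 0.97,
                "index": 3,
            }
        ],
    ),
    ("", []),  # blank line is preserved instead of being skipped
]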
 
    merged = [ents[0].copy()]
    merged[0]["score_sum"] = ents[0]["score"]
    merged[0]["count"] = 1
    for ent in ents[1:]:
        prev = merged[-1]
        if ent["index"] == prev["index"] + 1:

            new_ent["score_sum"] = ent["score"]
            new_ent["count"] = 1
            merged.append(new_ent)
    if "score_sum" in merged[-1]:
        merged[-1]["score"] = merged[-1]["score_sum"] / merged[-1]["count"]
        del merged[-1]["score_sum"], merged[-1]["count"]
    final = []
    for ent in merged:
        w = ent["word"].strip()
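For context, the score_sum / count bookkeeping shown here averages sub-token confidences into one entity score, which is what the threshold slider later compares against. A tiny worked example, not part of the commit:

# Two consecutive sub-tokens with scores 0.91 and 0.73 are merged into one entity;
# the averaged confidence is what gets checked against the slider threshold.
score_sum = 0.91 + 0.73
count = 2
merged_score = score_sum / count  # 0.82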
 
    return int(number * factor) / factor


+ # Canonical grouping
+ def canonical_key(text: str, label: str):
+     s = text.casefold().strip()
+
+     if label == "RS":
+         m = re.search(r"(ecli:[a-z]{2}:[a-z0-9]+:\d{4}:[a-z0-9.\-]+)", s)
+         if m:
+             original = text[m.start() : m.end()]
+             canon = m.group(1).replace(" ", "")
+             return (canon, label, original)
+
+         m = re.search(
+             r"((?:[ivxlcdm]+|\d{1,3})\s*[a-zäöüß]{1,3}\s*\d{1,6}\s*/\s*\d{2,4})", s
+         )
+         if m:
+             original = text[m.start() : m.end()].strip()
+             canon = re.sub(r"\s+", "", m.group(1))
+             return (canon, label, original)
+
+         cleaned = re.sub(r"[^\w]+", "", s)
+         return (cleaned, label, text.strip())
+
+     cleaned_generic = re.sub(r"[^\w]+", " ", s)
+     cleaned_generic = re.sub(r"\s+", " ", cleaned_generic).strip()
+     return (cleaned_generic, label, text.strip())
+
+
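A quick sketch of what the RS branch of canonical_key above does to a made-up docket number; it only reuses the regular expressions from the function and is not part of the commit:

import re

# Hypothetical court file number; casefold() matches what canonical_key does.
s = "VI ZR 10/21".casefold().strip()          # "vi zr 10/21"
m = re.search(
    r"((?:[ivxlcdm]+|\d{1,3})\s*[a-zäöüß]{1,3}\s*\d{1,6}\s*/\s*\d{2,4})", s
)
canon = re.sub(r"\s+", "", m.group(1))         # "vizr10/21"
# "VI ZR 10/21", "VI ZR 10 / 21" and "vi zr 10/21" all collapse to the same key,
# so they land in one sidebar group and share one suffix.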
+ # Rendering
def highlight_entities(
    line,
    merged_entities,
 
        truncated_score = truncate(ent["score"], 2)
        tooltip = f"{label_desc} ({truncated_score:.2f})"
        color = ENTITY_COLORS.get(label, "#cccccc")
+
        html += line[last_end:start]

        should_anonymize = any(
            label in entity_importance[level] for level in importance_levels
        )
+
        if should_anonymize:
            key = (ent["word"].lower(), label)
            if key not in anonymized_map:

                suffix = chr(ord("A") + count)
                label_counters[label] = count + 1
                anonymized_map[key] = suffix
+
            suffix = anonymized_map[key]
            display = f"{label_desc} {suffix}"
+
            normalized_word = ent["word"].strip().lower()
            display_key = f"{label_desc} {suffix} : {normalized_word}"
+
+             if allowed_keys and display_key not in allowed_keys:
                display = ent["word"]
                style = ""
                css_class = "entity"

            style = ""
            css_class = "entity"

+         html += f'<span class="{css_class}" style="{style}" title="{tooltip}">{display}</span>'
        last_end = end

    html += line[last_end:]
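With the extra "allowed_keys and ..." guard above, an empty selection no longer reveals every entity; a pseudonym is only swapped back to the original word when a non-empty selection explicitly excludes it. A standalone sketch of that branch, with placeholder label names (not part of the commit):

allowed_keys = {"Person A : max mustermann"}   # keys ticked in the sidebar
display_key = "Person B : erika musterfrau"    # this variant was unticked
display = "Person B"                           # pseudonym that would be shown
original_word = "Erika Musterfrau"

if allowed_keys and display_key not in allowed_keys:
    display = original_word                    # deselected entity stays readable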
 
        st.session_state.manual_phrases = []

    st.markdown("#### Juristische Anonymisierung")
+
    uploaded_file = st.file_uploader(
        "Bitte laden Sie eine .txt-Datei hoch:", type="txt"
    )
 
        ent_list = [entity_labels[k] for k in entity_importance[level]]
        st.markdown(f"**{label}**: {', '.join(ent_list)}")

+     threshold = st.slider("Schwellenwert für das Modellvertrauen:", 0.0, 1.0, 0.5, 0.01)
+
    st.markdown("---")

    if uploaded_file:
 
        with st.spinner("Modell wird einmalig auf die Datei angewendet..."):
            merged_all_lines = ner_merge_lines(text)

+         # Manual phrases to RED
        manual_phrases = st.session_state.manual_phrases
        overlap_warnings = set()
        for idx, (line, merged) in enumerate(merged_all_lines):
            for phrase in manual_phrases:
                for match in re.finditer(re.escape(phrase), line.lower()):
                    start, end = match.start(), match.end()
                    if any(start < e["end"] and end > e["start"] for e in merged):
                        overlap_warnings.add(phrase)
                        continue
                    merged.append(
                        {
                            "start": start,

                            "index": 9999,
                        }
                    )
            merged_all_lines[idx] = (line, sorted(merged, key=lambda x: x["start"]))
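The start < e["end"] and end > e["start"] test above is the usual interval-overlap check: a manual phrase is skipped, and a warning recorded, whenever its span shares at least one character with an entity the model already found. For example (not part of the commit):

start, end = 10, 24                 # span of the manually entered phrase
e = {"start": 18, "end": 30}        # span of a model-detected entity
overlaps = start < e["end"] and end > e["start"]  # True: positions 18-23 lie in both spans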
 
+         # Grouping layer for the sidebar
+         groups = defaultdict(
+             lambda: {"variants": set(), "displays": set(), "rep": None}
+         )

        for _, merged in merged_all_lines:
            for ent in merged:
                label = ent["entity"].split("-")[-1]
                if any(label in entity_importance[lvl] for lvl in importance_levels):
+
+                     variant_norm = ent["word"].strip().lower()
+                     canon_key, canon_label, display_key = canonical_key(
+                         ent["word"], label
+                     )
+
+                     g = groups[(canon_key, canon_label)]
+                     g["variants"].add(variant_norm)
+                     g["displays"].add(display_key)
+
+         # Suffix per canonical group
+         label_counters_for_groups = {}
+         for (canon_text, label), data in groups.items():
+             count = label_counters_for_groups.get(label, 0)
+             suffix = chr(ord("A") + count)
+             label_counters_for_groups[label] = count + 1
+             data["suffix"] = suffix
+
+         for key, data in groups.items():
+             if data["displays"]:
+                 data["rep"] = max(data["displays"], key=len)
+             else:
+                 data["rep"] = ""
+
+         anonymized_map = {}
+         for (canon_text, label), data in groups.items():
+             suffix = data["suffix"]
+             for v in data["variants"]:
+                 anonymized_map[(v, label)] = suffix
+
+         entity_labels_map = entity_labels
+         display_to_variants = {}
+         groups_by_label_desc = defaultdict(list)
+         all_display_keys = set()
+
+         for (canon_text, label), data in groups.items():
+             label_desc = entity_labels_map.get(label, label)
+             suffix = data["suffix"]
+             shown = f"{label_desc} {suffix} : {data['rep']}"
+             groups_by_label_desc[label_desc].append(shown)
+             display_keys = [f"{label_desc} {suffix} : {v}" for v in data["variants"]]
+             display_to_variants[shown] = display_keys
+             all_display_keys.update(display_keys)
+
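Sketch of what the grouping layer above produces when two spellings of the same name collapse to one canonical key; the key and label are assumed for illustration and this is not part of the commit:

from collections import defaultdict

groups = defaultdict(lambda: {"variants": set(), "displays": set(), "rep": None})

# Two surface forms that only differ in whitespace; the generic branch of
# canonical_key would collapse both to the hypothetical key ("max mustermann", "PER").
for surface in ("Max Mustermann", "Max  Mustermann"):
    g = groups[("max mustermann", "PER")]
    g["variants"].add(surface.strip().lower())
    g["displays"].add(surface.strip())

# One suffix per canonical group, shared by every variant:
suffix = chr(ord("A") + 0)   # first group seen for this label -> "A"
anonymized_map = {
    (variant, "PER"): suffix
    for variant in groups[("max mustermann", "PER")]["variants"]
}
# {("max mustermann", "PER"): "A", ("max  mustermann", "PER"): "A"}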
+         label_order = [
+             "RS",
+             "GS",
+             "PER",
+             "AN",
+             "GRT",
+             "VO",
+             "VS",
+             "VT",
+             "EUN",
+             "LIT",
+             "UN",
+             "INN",
+             "ORG",
+             "MRK",
+             "RR",
+             "LD",
+             "LDS",
+             "ST",
+             "STR",
+             "RED",
+         ]
+         label_order_desc = [entity_labels_map.get(x, x) for x in label_order]
 
        with st.sidebar:
            st.markdown("### Neue Phrase schwärzen:")
            if "manual_phrases" not in st.session_state:
                st.session_state.manual_phrases = []

            with st.form("manual_add_form"):
                new_phrase = st.text_input("Neue Phrase:")
                submitted = st.form_submit_button("Hinzufügen")

+             with st.sidebar.expander("Hinweise zu manuellen Phrasen", expanded=False):
+                 st.markdown("**Noch in Entwicklung**")
+                 st.markdown(
+                     "_Manuelle Schwärzungen können fehlschlagen, wenn sich die Phrase "
+                     "mit bereits erkannten Entitäten überschneidet oder über mehrere "
+                     "Zeilen erstreckt._"
+                 )
+
+             if submitted and new_phrase.strip():
+                 cleaned = new_phrase.strip().lower()
+                 if cleaned not in st.session_state.manual_phrases:
+                     st.session_state.manual_phrases.append(cleaned)
+                     st.rerun()
 
            st.markdown("---")
            st.markdown("### Anonymisierte Entitäten verwalten:")
+
+             selected_canon = []
+             for lab_desc in label_order_desc:
+                 items = groups_by_label_desc.get(lab_desc, [])
+                 if not items:
+                     continue
+                 st.markdown(f"**{lab_desc}**")
+                 for shown in sorted(items, key=str.lower):
+                     checked = st.checkbox(shown, value=True, key=f"chk::{shown}")
+                     if checked:
+                         selected_canon.append(shown)
+
+             if not selected_canon and groups_by_label_desc:
+                 selected_canon = [
+                     x for items in groups_by_label_desc.values() for x in items
+                 ]
+
+             allowed_keys = set()
+             for shown in selected_canon:
+                 allowed_keys.update(display_to_variants.get(shown, []))
+
+             if not allowed_keys and all_display_keys:
+                 allowed_keys = set(all_display_keys)
+
+             label_counters_runtime = {}
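Each checkbox now represents a whole canonical group, so leaving it ticked keeps every recorded spelling in allowed_keys. Roughly, with illustrative values (not part of the commit):

display_to_variants = {
    "Person A : Max Mustermann": [
        "Person A : max mustermann",
        "Person A : max  mustermann",
    ]
}
selected_canon = ["Person A : Max Mustermann"]   # boxes left ticked in the sidebar

allowed_keys = set()
for shown in selected_canon:
    allowed_keys.update(display_to_variants.get(shown, []))
# allowed_keys now holds one key per variant, matching what highlight_entities checks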
 
        anonymized_lines = []
        for line, merged in merged_all_lines:
            if not line.strip():
+                 st.markdown("<br/>", unsafe_allow_html=True)
                anonymized_lines.append("")
                continue

                merged,
                importance_levels,
                threshold,
+                 label_counters_runtime,
                anonymized_map,
+                 allowed_keys,
                entity_labels,
                entity_importance,
                ENTITY_COLORS,
            )
+
            st.markdown(
+                 f'<div style="white-space: pre-wrap;">{html_line}</div>',
                unsafe_allow_html=True,
            )
+
+             cleaned = re.sub(r"<!--.*?-->", "", html_line, flags=re.DOTALL)
            text_only = re.sub(r"<[^>]+>", "", cleaned)
            anonymized_lines.append(text_only.strip())
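Since the tooltip moved from a nested <span class="tooltip"> into a title attribute, stripping tags is enough to recover the anonymised plain text, and the added re.sub only has to clear HTML comments. A standalone sketch with an illustrative rendered line (not part of the commit):

import re

html_line = (
    '<div style="white-space: pre-wrap;">'
    '<span class="entity" title="Person (0.97)">Person A</span> erschien nicht.'
    '</div>'
)
cleaned = re.sub(r"<!--.*?-->", "", html_line, flags=re.DOTALL)
text_only = re.sub(r"<[^>]+>", "", cleaned)
print(text_only)  # Person A erschien nicht.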