Update app.py
app.py CHANGED
@@ -1,5 +1,6 @@
 import re
 import string
+from collections import defaultdict
 
 import seaborn as sns
 import streamlit as st
@@ -12,10 +13,12 @@ from transformers import (
 )
 
 
-# Setup
+# Setup
 def setup_page():
     st.set_page_config(
-        page_title="Juristische Anonymisierung",
+        page_title="Juristische Anonymisierung",
+        page_icon="⚖️",
+        layout="wide",
     )
     logging.set_verbosity(logging.ERROR)
     st.markdown(
@@ -115,11 +118,12 @@ def load_ner_model():
 
 
 @st.cache_data(show_spinner=False)
-def ner_merge_lines(text):
+def ner_merge_lines(text: str):
     ner = load_ner_model()
     merged_lines = []
     for line in text.splitlines():
         if not line.strip():
+            merged_lines.append((line, []))
             continue
         tokens = ner(line)
         merged = merge_entities(tokens)
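The new `merged_lines.append((line, []))` branch keeps blank lines in the result instead of dropping them, so the output stays line-aligned with the input. A minimal sketch of why that alignment matters for the rendering loop further down; the NER call is stubbed out here, only the control flow is from the diff:

```python
def fake_ner(line):
    return [{"word": line.split()[0]}]  # stand-in for ner(line) + merge_entities

def ner_merge_lines_sketch(text):
    merged_lines = []
    for line in text.splitlines():
        if not line.strip():
            merged_lines.append((line, []))  # keep the blank, with no entities
            continue
        merged_lines.append((line, fake_ner(line)))
    return merged_lines

for line, ents in ner_merge_lines_sketch("Erste Zeile\n\nDritte Zeile"):
    print(repr(line), len(ents))  # three tuples, one per input line
```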
@@ -134,7 +138,6 @@ def merge_entities(entities):
     merged = [ents[0].copy()]
     merged[0]["score_sum"] = ents[0]["score"]
     merged[0]["count"] = 1
-
     for ent in ents[1:]:
         prev = merged[-1]
         if ent["index"] == prev["index"] + 1:
@@ -151,11 +154,9 @@ def merge_entities(entities):
             new_ent["score_sum"] = ent["score"]
             new_ent["count"] = 1
             merged.append(new_ent)
-
     if "score_sum" in merged[-1]:
         merged[-1]["score"] = merged[-1]["score_sum"] / merged[-1]["count"]
         del merged[-1]["score_sum"], merged[-1]["count"]
-
     final = []
     for ent in merged:
         w = ent["word"].strip()
@@ -175,6 +176,34 @@ def truncate(number, decimals=2):
     return int(number * factor) / factor
 
 
+# Canonical grouping
+def canonical_key(text: str, label: str):
+    s = text.casefold().strip()
+
+    if label == "RS":
+        m = re.search(r"(ecli:[a-z]{2}:[a-z0-9]+:\d{4}:[a-z0-9.\-]+)", s)
+        if m:
+            original = text[m.start() : m.end()]
+            canon = m.group(1).replace(" ", "")
+            return (canon, label, original)
+
+        m = re.search(
+            r"((?:[ivxlcdm]+|\d{1,3})\s*[a-zäöüß]{1,3}\s*\d{1,6}\s*/\s*\d{2,4})", s
+        )
+        if m:
+            original = text[m.start() : m.end()].strip()
+            canon = re.sub(r"\s+", "", m.group(1))
+            return (canon, label, original)
+
+        cleaned = re.sub(r"[^\w]+", "", s)
+        return (cleaned, label, text.strip())
+
+    cleaned_generic = re.sub(r"[^\w]+", " ", s)
+    cleaned_generic = re.sub(r"\s+", " ", cleaned_generic).strip()
+    return (cleaned_generic, label, text.strip())
+
+
+# Rendering
 def highlight_entities(
     line,
     merged_entities,
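The docket-number branch of `canonical_key` is the workhorse of this commit: it collapses spacing and case variants of the same Aktenzeichen onto one key, so they later share one placeholder. A runnable sketch using the same regex, condensed from the function above (the ECLI branch and the surrounding label logic are omitted):

```python
import re

# The docket-number pattern from canonical_key, unchanged.
DOCKET = re.compile(r"((?:[ivxlcdm]+|\d{1,3})\s*[a-zäöüß]{1,3}\s*\d{1,6}\s*/\s*\d{2,4})")

def docket_key(text):
    # casefold the input, then strip whitespace inside the match, as in the RS branch
    m = DOCKET.search(text.casefold())
    return re.sub(r"\s+", "", m.group(1)) if m else None

print(docket_key("VIII ZR 123/20"))    # viiizr123/20
print(docket_key("viii zr 123 / 20"))  # viiizr123/20 -> same canonical key
```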
@@ -198,11 +227,13 @@ def highlight_entities(
         truncated_score = truncate(ent["score"], 2)
         tooltip = f"{label_desc} ({truncated_score:.2f})"
         color = ENTITY_COLORS.get(label, "#cccccc")
+
         html += line[last_end:start]
 
         should_anonymize = any(
             label in entity_importance[level] for level in importance_levels
         )
+
         if should_anonymize:
             key = (ent["word"].lower(), label)
             if key not in anonymized_map:
@@ -210,11 +241,14 @@ def highlight_entities(
                 suffix = chr(ord("A") + count)
                 label_counters[label] = count + 1
                 anonymized_map[key] = suffix
+
             suffix = anonymized_map[key]
             display = f"{label_desc} {suffix}"
+
             normalized_word = ent["word"].strip().lower()
             display_key = f"{label_desc} {suffix} : {normalized_word}"
-
+
+            if allowed_keys and display_key not in allowed_keys:
                 display = ent["word"]
                 style = ""
                 css_class = "entity"
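The new `allowed_keys` guard reverts a span to its original text when its `display_key` was deselected in the sidebar. A small sketch of that gate; `"Person"` is a hypothetical label description standing in for whatever `entity_labels` maps `PER` to:

```python
# Sketch of the allowed_keys gate in highlight_entities.
label_desc, suffix, word = "Person", "A", "Max Mustermann"
display = f"{label_desc} {suffix}"  # the placeholder, e.g. "Person A"
display_key = f"{label_desc} {suffix} : {word.strip().lower()}"

allowed_keys = {"Person A : max mustermann"}  # this variant's checkbox is ticked
if allowed_keys and display_key not in allowed_keys:
    display = word  # deselected -> fall back to the clear text
print(display)  # Person A (still anonymized, since the key is allowed)
```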
@@ -226,7 +260,7 @@ def highlight_entities(
                 style = ""
                 css_class = "entity"
 
-        html += f'<span class="{css_class}" style="{style}"
+        html += f'<span class="{css_class}" style="{style}" title="{tooltip}">{display}</span>'
         last_end = end
 
     html += line[last_end:]
@@ -243,6 +277,7 @@ def main():
        st.session_state.manual_phrases = []
 
    st.markdown("#### Juristische Anonymisierung")
+
    uploaded_file = st.file_uploader(
        "Bitte laden Sie eine .txt-Datei hoch:", type="txt"
    )
@@ -263,7 +298,8 @@ def main():
        ent_list = [entity_labels[k] for k in entity_importance[level]]
        st.markdown(f"**{label}**: {', '.join(ent_list)}")
 
-    threshold = st.slider("Schwellenwert für das Modellvertrauen:", 0.0, 1.0, 0.
+    threshold = st.slider("Schwellenwert für das Modellvertrauen:", 0.0, 1.0, 0.5, 0.01)
+
    st.markdown("---")
 
    if uploaded_file:
@@ -277,18 +313,16 @@ def main():
        with st.spinner("Modell wird einmalig auf die Datei angewendet..."):
            merged_all_lines = ner_merge_lines(text)
 
+        # Manual phrases to RED
        manual_phrases = st.session_state.manual_phrases
        overlap_warnings = set()
-
        for idx, (line, merged) in enumerate(merged_all_lines):
            for phrase in manual_phrases:
                for match in re.finditer(re.escape(phrase), line.lower()):
                    start, end = match.start(), match.end()
-
                    if any(start < e["end"] and end > e["start"] for e in merged):
                        overlap_warnings.add(phrase)
                        continue
-
                    merged.append(
                        {
                            "start": start,
@@ -299,75 +333,140 @@ def main():
                            "index": 9999,
                        }
                    )
-
            merged_all_lines[idx] = (line, sorted(merged, key=lambda x: x["start"]))
 
-
-
-
+        # Grouping layer for the sidebar
+        groups = defaultdict(
+            lambda: {"variants": set(), "displays": set(), "rep": None}
+        )
 
        for _, merged in merged_all_lines:
            for ent in merged:
                label = ent["entity"].split("-")[-1]
                if any(label in entity_importance[lvl] for lvl in importance_levels):
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+                    variant_norm = ent["word"].strip().lower()
+                    canon_key, canon_label, display_key = canonical_key(
+                        ent["word"], label
+                    )
+
+                    g = groups[(canon_key, canon_label)]
+                    g["variants"].add(variant_norm)
+                    g["displays"].add(display_key)
+
+        # Suffix per canonical group
+        label_counters_for_groups = {}
+        for (canon_text, label), data in groups.items():
+            count = label_counters_for_groups.get(label, 0)
+            suffix = chr(ord("A") + count)
+            label_counters_for_groups[label] = count + 1
+            data["suffix"] = suffix
+
+        for key, data in groups.items():
+            if data["displays"]:
+                data["rep"] = max(data["displays"], key=len)
+            else:
+                data["rep"] = ""
+
+        anonymized_map = {}
+        for (canon_text, label), data in groups.items():
+            suffix = data["suffix"]
+            for v in data["variants"]:
+                anonymized_map[(v, label)] = suffix
+
+        entity_labels_map = entity_labels
+        display_to_variants = {}
+        groups_by_label_desc = defaultdict(list)
+        all_display_keys = set()
+
+        for (canon_text, label), data in groups.items():
+            label_desc = entity_labels_map.get(label, label)
+            suffix = data["suffix"]
+            shown = f"{label_desc} {suffix} : {data['rep']}"
+            groups_by_label_desc[label_desc].append(shown)
+            display_keys = [f"{label_desc} {suffix} : {v}" for v in data["variants"]]
+            display_to_variants[shown] = display_keys
+            all_display_keys.update(display_keys)
+
+        label_order = [
+            "RS",
+            "GS",
+            "PER",
+            "AN",
+            "GRT",
+            "VO",
+            "VS",
+            "VT",
+            "EUN",
+            "LIT",
+            "UN",
+            "INN",
+            "ORG",
+            "MRK",
+            "RR",
+            "LD",
+            "LDS",
+            "ST",
+            "STR",
+            "RED",
+        ]
+        label_order_desc = [entity_labels_map.get(x, x) for x in label_order]
 
        with st.sidebar:
            st.markdown("### Neue Phrase schwärzen:")
-
            if "manual_phrases" not in st.session_state:
                st.session_state.manual_phrases = []
 
            with st.form("manual_add_form"):
                new_phrase = st.text_input("Neue Phrase:")
                submitted = st.form_submit_button("Hinzufügen")
-            with st.sidebar.expander(
-                "Hinweise zu manuellen Phrasen", expanded=False
-            ):
-                st.markdown("**Noch in Entwicklung**")
-                st.markdown(
-                    "_Manuelle Schwärzungen können fehlschlagen, wenn sich die Phrase mit bereits erkannten Entitäten überschneidet oder über mehrere Zeilen erstreckt._"
-                )
 
-
-
-
-
-
+            with st.sidebar.expander("Hinweise zu manuellen Phrasen", expanded=False):
+                st.markdown("**Noch in Entwicklung**")
+                st.markdown(
+                    "_Manuelle Schwärzungen können fehlschlagen, wenn sich die Phrase "
+                    "mit bereits erkannten Entitäten überschneidet oder über mehrere "
+                    "Zeilen erstreckt._"
+                )
+
+            if submitted and new_phrase.strip():
+                cleaned = new_phrase.strip().lower()
+                if cleaned not in st.session_state.manual_phrases:
+                    st.session_state.manual_phrases.append(cleaned)
+                    st.rerun()
 
            st.markdown("---")
            st.markdown("### Anonymisierte Entitäten verwalten:")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            selected_canon = []
+            for lab_desc in label_order_desc:
+                items = groups_by_label_desc.get(lab_desc, [])
+                if not items:
+                    continue
+                st.markdown(f"**{lab_desc}**")
+                for shown in sorted(items, key=str.lower):
+                    checked = st.checkbox(shown, value=True, key=f"chk::{shown}")
+                    if checked:
+                        selected_canon.append(shown)
+
+            if not selected_canon and groups_by_label_desc:
+                selected_canon = [
+                    x for items in groups_by_label_desc.values() for x in items
+                ]
+
+            allowed_keys = set()
+            for shown in selected_canon:
+                allowed_keys.update(display_to_variants.get(shown, []))
+
+            if not allowed_keys and all_display_keys:
+                allowed_keys = set(all_display_keys)
+
+            label_counters_runtime = {}
 
        anonymized_lines = []
        for line, merged in merged_all_lines:
            if not line.strip():
-                st.markdown("<br
+                st.markdown("<br/>", unsafe_allow_html=True)
                anonymized_lines.append("")
                continue
 
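The grouping layer above assigns one letter suffix per canonical group and label, then fans it out to every surface variant, so all spellings of one entity share a single placeholder. A self-contained sketch of that fan-out with invented groups:

```python
# Sketch: one suffix per (canonical text, label) group, fanned out to
# every surface variant. Group contents are made up for illustration.
groups = {
    ("maxmustermann", "PER"): {"variants": {"max mustermann", "m. mustermann"}},
    ("ernamusterfrau", "PER"): {"variants": {"erna musterfrau"}},
}

label_counters = {}
anonymized_map = {}
for (canon, label), data in groups.items():
    count = label_counters.get(label, 0)
    suffix = chr(ord("A") + count)  # A, B, C, ... counted per label
    label_counters[label] = count + 1
    for variant in data["variants"]:
        anonymized_map[(variant, label)] = suffix

# Both spellings of the same person share one placeholder letter.
print(anonymized_map[("max mustermann", "PER")]
      == anonymized_map[("m. mustermann", "PER")])  # True
```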
@@ -376,18 +475,20 @@ def main():
                merged,
                importance_levels,
                threshold,
-
+                label_counters_runtime,
                anonymized_map,
-
+                allowed_keys,
                entity_labels,
                entity_importance,
                ENTITY_COLORS,
            )
+
            st.markdown(
-                f'<div style="
+                f'<div style="white-space: pre-wrap;">{html_line}</div>',
                unsafe_allow_html=True,
            )
-
+
+            cleaned = re.sub(r"<!--.*?-->", "", html_line, flags=re.DOTALL)
            text_only = re.sub(r"<[^>]+>", "", cleaned)
            anonymized_lines.append(text_only.strip())
 
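For the plain-text export, HTML comments are stripped before the generic tag regex runs, since a `>` inside a comment would otherwise cut the comment in half. A quick demonstration with an invented line:

```python
import re

# The "a > b" inside the comment would break a naive single-pass
# tag-stripping regex, so comments are removed first.
html_line = 'Vorher <span title="Person (0.97)">Person A</span> <!-- a > b --> nachher'
cleaned = re.sub(r"<!--.*?-->", "", html_line, flags=re.DOTALL)
text_only = re.sub(r"<[^>]+>", "", cleaned)
print(text_only.strip())  # "Vorher Person A  nachher"
```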