"""Streamlit app for browsing false-positive / false-negative PII detections.

This code was adapted from
https://huggingface.co/spaces/HugoLaurencon/examples_before_after_pii/
"""
import html
import json

import pandas as pd
import streamlit as st

st.set_page_config(page_title="PII Visualization", layout="wide")
st.title("PII Visualization")

tags = ["KEY", "IP_ADDRESS", "EMAIL"]
types = ["False positives", "False negatives"]
# Maps the human-readable detection type to the short code used in the data
# file names and in the in-text span markers.
matches = {"False negatives": "fn", "False positives": "fp"}
# Marker that terminates every annotated span inside a sample.
END_MARKER = "END_PI"


@st.cache()
def load_data():
    """Load the samples for the currently selected tag and detection type.

    Reads the module-level ``chosen_tag`` and ``chosen_type`` selections and
    returns the parsed content of the matching JSON file under ``data/``.

    NOTE(review): the function takes no arguments, so cache correctness relies
    on ``st.cache`` taking the referenced globals into account when keying the
    cached value -- confirm against the installed Streamlit version.
    """
    path = f"data/{chosen_tag.lower()}_detections_{matches[chosen_type]}.json"
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        samples = json.load(f)
    return samples


def _split_annotations(text, start_marker, end_marker=END_MARKER):
    """Split *text* around its annotated spans.

    Returns ``(pieces, tail)`` where ``pieces`` is a list of
    ``(plain_text_before_span, span)`` pairs in document order and ``tail`` is
    the text left after the last span. A span runs from ``start_marker``
    through the first ``end_marker`` that follows it; if the terminator is
    missing the span extends to the end of *text* instead of raising.
    """
    pieces = []
    cursor = 0
    while True:
        start = text.find(start_marker, cursor)
        if start == -1:
            break
        # Anchor the search at the span start so an unrelated terminator that
        # appears earlier in the remaining text cannot yield end < start.
        close = text.find(end_marker, start)
        end = len(text) if close == -1 else close + len(end_marker)
        pieces.append((text[cursor:start], text[start:end]))
        cursor = end
    return pieces, text[cursor:]


col1, col2, col3 = st.columns([1, 1, 4])
with col1:
    chosen_type = st.selectbox(
        label="Select the type of detections",
        options=types,
        index=0)
with col2:
    chosen_tag = st.selectbox(
        label="Select the PII TAG",
        options=tags,
        index=0)

samples = load_data()
max_docs = len(samples)

# With no samples the index widget would get max_value < min_value and the
# lookup below would fail, so stop the script run with a message instead.
if not samples:
    st.warning("No examples available for this selection.")
    st.stop()

col1, col2 = st.columns([2, 4])
with col1:
    index_example = st.number_input(f"Index of the chosen example from the existing {max_docs}", min_value=0, max_value=max_docs-1, value=0, step=1)

st.write("Scroll down to visualize PII detections highlighted in yellow, we split the text at the start and end of the key to highlight it.")

detection = samples[index_example]
delimiter = f"PI:{matches[chosen_type].upper()}"
count = detection.count(delimiter)

st.subheader(f"{count} {chosen_type.lower()} for {chosen_tag} tag in example {index_example}:")

pieces, tail = _split_annotations(detection, delimiter)
for plain, annotated in pieces:
    st.code(plain)
    # The sample is arbitrary text rendered with unsafe_allow_html=True, so it
    # is escaped first; the surrounding span supplies the yellow highlight.
    st.markdown(
        '<span style="background-color: #FFFF00">'
        + html.escape(annotated)
        + "</span>",
        unsafe_allow_html=True,
    )
st.code(tail)