loubnabnl's picture
loubnabnl HF staff
Duplicate from bigcode/pii-test
461c45d
raw
history blame
No virus
1.87 kB
"""
This code was adapted from https://huggingface.co/spaces/HugoLaurencon/examples_before_after_pii/
"""
import html
import json

import pandas as pd
import streamlit as st
# Page-level configuration and heading.
st.set_page_config(
    layout="wide",
    page_title="PII Visualization",
)
st.title("PII Visualization")

# PII tags offered in the tag selector.
tags = [
    "KEY",
    "IP_ADDRESS",
    "EMAIL",
]

# Detection-type label shown in the UI -> short code used in data file names
# and in the in-text delimiter.
matches = {
    "False positives": "fp",
    "False negatives": "fn",
}

# Labels offered in the detection-type selector, in display order.
types = list(matches)
@st.cache()
def load_data():
    """Load the detection samples for the currently selected tag and type.

    The file path is built from the module-level ``chosen_tag`` and
    ``chosen_type`` selections (mapped through ``matches``), i.e.
    ``data/<tag in lower case>_detections_<fp|fn>.json``.

    NOTE(review): the result is cached with ``st.cache`` while the inputs are
    globals rather than parameters -- confirm the cache accounts for them in
    the Streamlit version in use.

    Returns:
        The parsed JSON content of the selected file.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: if the file is not valid JSON.
    """
    path = f"data/{chosen_tag.lower()}_detections_{matches[chosen_type]}.json"
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which may not be UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
# Two narrow columns hold the selectors; the third column is not used in the
# visible part of the script.
col1, col2, col3 = st.columns([1, 1, 4])

# Which kind of detection error to browse.
chosen_type = col1.selectbox(label="Select the type of detections", options=types, index=0)

# Which PII tag to browse.
chosen_tag = col2.selectbox(label="Select the PII TAG", options=tags, index=0)

# load_data() reads chosen_type and chosen_tag from module scope, so it must
# be called only after both selectors have been evaluated.
samples = load_data()
# Number of examples available for the current selection.
max_docs = len(samples)

# With no examples, the index picker below would get max_value=-1 (smaller
# than min_value=0) and samples[index_example] would have nothing to return,
# so tell the user and end this script run here.
if not samples:
    st.warning(f"No {chosen_type.lower()} found for {chosen_tag} tag.")
    st.stop()

col1, col2 = st.columns([2, 4])
with col1:
    # Zero-based index of the example to display.
    index_example = st.number_input(
        f"Index of the chosen example from the existing {max_docs}",
        min_value=0,
        max_value=max_docs - 1,
        value=0,
        step=1,
    )

st.write("Scroll down to visualize PII detections highlighted in yellow, we split the text at the start and end of the key to highlight it.")

# Text of the chosen example; detections are marked inline, starting with
# "PI:FP" or "PI:FN" depending on the selected detection type.
detection = samples[index_example]
delimiter = f"PI:{matches[chosen_type].upper()}"

# Number of marked detections in this example.
count = detection.count(delimiter)
st.subheader(f"{count} {chosen_type.lower()} for {chosen_tag} tag in example {index_example}:")
# Render the example as alternating blocks: plain text before each detection
# (st.code) followed by the detection itself highlighted in yellow.
end_marker = "END_PI"
advance = 0  # position in `detection` up to which text was already rendered

while True:
    start = detection.find(delimiter, advance)
    if start == -1:
        # No further detection marker: the rest is plain text.
        break

    # Look for the closing marker only AFTER the opening one, so an "END_PI"
    # that occurs earlier in the remaining text cannot yield end < start.
    marker_pos = detection.find(end_marker, start)
    if marker_pos == -1:
        # Unterminated detection: highlight up to the end of the text
        # instead of failing.
        end = len(detection)
    else:
        end = marker_pos + len(end_marker)

    st.code(detection[advance:start])
    # The span is rendered as raw HTML, so the sample text must be escaped;
    # otherwise "<", ">" and "&" in it would be interpreted as markup.
    st.markdown(
        '<span style="background-color: #FFFF00">'
        + html.escape(detection[start:end], quote=False)
        + "</span>",
        unsafe_allow_html=True,
    )
    advance = end

# Trailing text after the last detection (the whole text if there was none).
last_part = detection[advance:]
st.code(last_part)