pii-public-demo / app.py
loubnabnl's picture
loubnabnl HF Staff
update pii link
a44cfc4
"""
This code was inspired by https://huggingface.co/spaces/HugoLaurencon/examples_before_after_pii/
and https://huggingface.co/spaces/SaulLu/diff-visualizer
"""
import streamlit as st
from datasets import load_dataset
import diff_viewer
# Page chrome: wide layout, a title, and a short description of the demo.
st.set_page_config(layout="wide", page_title="PII Visualization")
st.title("PII Anonymization 🔐")
# Adjacent string literals are joined by the parser into a single markdown body.
st.markdown(
    "This demo allows the visualization of personal information anonymization on some code files. "
    "This is just an illustration of [BigCode's PII pipeline](https://github.com/bigcode-project/bigcode-dataset/tree/main/pii) results and the examples and secrets are **synthetic**."
)
@st.cache()
def load_data(language="python"):
    """Return the ``train`` split of the dataset stored under ``data``.

    ``language`` is accepted for the caller's convenience but is not used
    in the body: the same dataset is loaded whatever value is passed.

    NOTE(review): the rows are presumably expected to expose ``content``,
    ``references`` and ``language`` columns -- confirm against the data
    files.
    """
    return load_dataset("data", split="train")
def get_samples_tag(dataset, tag):
    """Return the positions of the rows whose references mention ``tag``.

    A row matches when its ``references`` value contains the text
    ``"PI:"`` followed by ``tag`` in upper case.

    Args:
        dataset: object providing ``__len__``, ``add_column`` and ``filter``.
        tag: tag name; it is upper-cased before matching.

    Returns:
        The ``"index"`` column of the filtered dataset, i.e. the original
        row positions of the matching samples.
    """

    def mentions_tag(row):
        return "PI:" + tag.upper() in row["references"]

    # Attach each row's position so it survives the filtering step.
    indexed = dataset.add_column("index", range(len(dataset)))
    return indexed.filter(mentions_tag)["index"]
# --- Example picker ---------------------------------------------------------
# Two-column row; only the narrower left column receives widgets.
picker_col, _ = st.columns([2, 4])
with picker_col:
    # TODO add examples in more languages
    lang = st.selectbox("Select a programming language", ["Python"])

# Loaded outside the column context, as in the original layout.
samples = load_data(language=lang.lower())
max_docs = len(samples)
if max_docs == 0:
    # With no rows the number input below would get max_value=-1, which is
    # below min_value=0 and makes Streamlit raise; stop with a message instead.
    st.error("No examples are available in the dataset.")
    st.stop()

with picker_col:
    index_example = st.number_input(
        f"Choose an example from the existing {max_docs}:",
        min_value=0,
        max_value=max_docs - 1,
        value=0,
        step=1,
    )

# --- Before / after view ----------------------------------------------------
st.markdown("Below we highlight the difference in code before and after the PII on the chosen synthetic example:")
example = samples[index_example]

# A narrow spacer column followed by two equal columns carrying the headers
# shown above the diff viewer.
_, before_col, after_col = st.columns([0.4, 1, 1])
with before_col:
    st.subheader("Code before PII redaction")
with after_col:
    st.subheader("Code after PII redaction")

# Rendered outside the columns, below the two headers.
diff_viewer.diff_viewer(old_text=example["content"], new_text=example["new_content"], lang="none")