"""Streamlit demo showing code files before and after PII anonymization.

This code was inspired from
https://huggingface.co/spaces/HugoLaurencon/examples_before_after_pii/
and https://huggingface.co/spaces/SaulLu/diff-visualizer
"""
import streamlit as st
from datasets import load_dataset
import diff_viewer

st.set_page_config(page_title="PII Visualization", layout="wide")
st.title("PII Anonymization 🔐")
# Implicit string concatenation keeps the message on readable source lines
# without embedding a backslash continuation inside the literal.
st.markdown(
    "This demo allows the visualization of personal information anonymization on some code files. "
    "This is just an illustration of [BigCode's PII pipeline]"
    "(https://github.com/bigcode-project/bigcode-dataset/tree/main/pii) "
    "results and the examples and secrets are **synthetic**."
)


# NOTE(review): `st.cache` is deprecated in newer Streamlit releases in favour
# of `st.cache_data` / `st.cache_resource`; it is kept here so the app keeps
# working on the Streamlit version it was written for -- confirm the pinned
# version before migrating.
@st.cache()
def load_data(language="python"):
    """Load the ``train`` split of the ``data`` dataset.

    The rest of this script reads the ``content``, ``new_content`` and
    ``references`` columns of the returned dataset.

    Args:
        language: Programming language selected in the UI. Currently unused:
            the same dataset is loaded whatever value is passed.

    Returns:
        The dataset returned by ``load_dataset("data", split="train")``.
    """
    # load dataset with modified files with: content, references and language columns
    dataset = load_dataset("data", split="train")
    return dataset


def get_samples_tag(dataset, tag):
    """Return the positions of the samples whose references mention ``tag``.

    Args:
        dataset: Dataset with a ``references`` column.
        tag: Tag name; it is upper-cased and prefixed with ``"PI:"`` before
            being searched for in each sample's ``references`` value.

    Returns:
        The ``index`` column of the filtered dataset, i.e. the row positions
        (in ``dataset``) of the matching samples.
    """
    marker = "PI:" + tag.upper()
    # add column id to be able to retrieve the sample after filtering;
    # materialize the range so add_column receives a plain list.
    indexed = dataset.add_column("index", list(range(len(dataset))))
    matching = indexed.filter(lambda sample: marker in sample["references"])
    return matching["index"]


col1, col2 = st.columns([2, 4])
with col1:
    # TODO add examples in more languages
    lang = st.selectbox("Select a programming language", ["Python"])

samples = load_data(language=lang.lower())
max_docs = len(samples)

if max_docs == 0:
    # With no samples, number_input would be given max_value=-1 < min_value=0
    # and fail; stop the script run with an explicit message instead.
    st.warning("No examples are available to display.")
    st.stop()

with col1:
    index_example = st.number_input(
        f"Choose an example from the existing {max_docs}:",
        min_value=0,
        max_value=max_docs - 1,
        value=0,
        step=1,
    )

st.markdown("Below we highlight the difference in code before and after the PII on the chosen synthetic example:")
example = samples[index_example]

# The first, narrow column is intentionally left empty -- presumably to line
# the two headers up with the diff viewer panes below (verify visually).
_spacer_col, before_col, after_col = st.columns([0.4, 1, 1])
with before_col:
    st.subheader("Code before PII redaction")
with after_col:
    st.subheader("Code after PII redaction")

diff_viewer.diff_viewer(
    old_text=example["content"],
    new_text=example["new_content"],
    lang="none",
)