File size: 1,944 Bytes
4c12510
 
 
 
 
 
 
 
 
 
bb169af
 
 
a44cfc4
4c12510
 
9f6f218
4c12510
bb169af
4c12510
 
 
 
 
 
 
 
 
 
 
 
efd78f7
bb169af
4c12510
 
 
 
 
bb169af
4c12510
 
bb169af
4c12510
 
 
 
 
bb169af
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""
This code was inspired by https://huggingface.co/spaces/HugoLaurencon/examples_before_after_pii/
and https://huggingface.co/spaces/SaulLu/diff-visualizer
"""

import streamlit as st
from datasets import load_dataset
import diff_viewer

# Page setup: wide layout, a title, and a short description of the demo.
# set_page_config is issued before any other Streamlit call, as in the
# original ordering.
st.set_page_config(layout="wide", page_title="PII Visualization")
st.title("PII Anonymization 🔐")

# The description is written as adjacent string literals; the leading spaces
# of the second literal are part of the text passed to st.markdown.
st.markdown(
    "This demo allows the visualization of personal information anonymization on some code files. "
    "    This is just an illustration of [BigCode's PII pipeline](https://github.com/bigcode-project/bigcode-dataset/tree/main/pii) "
    "results and the examples and secrets are **synthetic**."
)

@st.cache()
def load_data(language="python"):
    """Load the ``train`` split of the dataset identified by ``"data"``.

    Per the original author's note, the dataset holds the modified files
    with ``content``, ``references`` and ``language`` columns.

    Args:
        language: Accepted for the caller's convenience but not read inside
            the function body; the same dataset is loaded for every value.

    Returns:
        Whatever ``load_dataset("data", split="train")`` returns.
    """
    return load_dataset("data", split="train")


def get_samples_tag(dataset, tag):
    """Return the positions of the samples whose references mention ``tag``.

    A sample is kept when ``"PI:"`` followed by the upper-cased ``tag`` is
    found (via ``in``) in the sample's ``references`` field.

    Args:
        dataset: Object providing ``__len__``, ``add_column`` and ``filter``.
        tag: Tag name; it is upper-cased before matching.

    Returns:
        The ``"index"`` column of the filtered dataset, i.e. the original
        positions of the matching samples.
    """
    def _mentions_tag(row):
        return ("PI:" + tag.upper()) in row['references']

    # Attach each row's position as an "index" column so the position is
    # still available once non-matching rows have been filtered out.
    indexed = dataset.add_column("index", range(len(dataset)))
    return indexed.filter(_mentions_tag)["index"]


# --- Example selection ------------------------------------------------------
# The narrow left column hosts the controls; the wider right column is never
# written to, so its handle is discarded.
col1, _ = st.columns([2, 4])
with col1:
    # TODO add examples in more languages
    lang = st.selectbox("Select a programming language", ["Python"])

# The dataset is loaded after the language is picked because the selected
# value is forwarded to load_data.
samples = load_data(language=lang.lower())
max_docs = len(samples)

with col1:
    # Valid choices are the positions 0 .. max_docs - 1 of the loaded samples.
    index_example = st.number_input(
        f"Choose an example from the existing {max_docs}:",
        min_value=0,
        max_value=max_docs - 1,
        value=0,
        step=1,
    )


st.markdown("Below we highlight the difference in code before and after the PII on the chosen synthetic example:")

example = samples[index_example]

# --- Before / after diff ----------------------------------------------------
# The first (narrow) column is only a spacer; the two headings go in the
# remaining equal-width columns.
_, col2, col3 = st.columns([0.4, 1, 1])
with col2:
    st.subheader("Code before PII redaction")
with col3:
    st.subheader("Code after PII redaction")

# "content" is passed as the old text and "new_content" as the new text.
# NOTE(review): lang="none" presumably disables syntax highlighting in the
# viewer — confirm against the diff_viewer component.
diff_viewer.diff_viewer(old_text=example["content"], new_text=example["new_content"], lang="none")