loubnabnl HF staff loubnabnl HF staff committed on
Commit
4c12510
0 Parent(s):

Duplicate from bigcode/pii-visualization

Browse files

Co-authored-by: loubna ben allal <loubnabnl@users.noreply.huggingface.co>

Files changed (4) hide show
  1. .gitattributes +34 -0
  2. README.md +13 -0
  3. app.py +56 -0
  4. requirements.txt +1 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: PII Visualization
3
+ emoji: 🕵️
4
+ colorFrom: pink
5
+ colorTo: yellow
6
+ sdk: streamlit
7
+ sdk_version: 1.10.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: bigcode/pii-visualization
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This code was inspired by https://huggingface.co/spaces/HugoLaurencon/examples_before_after_pii/
3
+ and https://huggingface.co/spaces/SaulLu/diff-visualizer
4
+ """
5
+
6
+ import streamlit as st
7
+ from datasets import load_dataset
8
+ import diff_viewer
9
+ import os
10
+
11
# Page setup: the wide layout and page title are configured before anything
# else is rendered.
st.set_page_config(layout="wide", page_title="PII Visualization")
st.title("PII Visualization")

# Credential passed to `load_dataset`: the value of the "data-pii" environment
# variable, or True when that variable is unset or empty.
# NOTE(review): True presumably tells the datasets library to fall back to
# locally stored Hub credentials -- confirm.
auth_token = os.getenv("data-pii") or True
14
+
15
@st.cache()
def load_data(language="python"):
    """Load the PII-check samples for one programming language.

    Reads the "train" split of ``bigcode/pii_checks_python_java_js`` from the
    ``data/<language>`` directory, authenticating with the module-level
    ``auth_token``, and drops the ``has_secrets``, ``number_secrets`` and
    ``path`` columns. The result is memoized by ``st.cache``.

    Args:
        language: Name of the sub-directory under ``data/`` to load.

    Returns:
        The loaded dataset without the three dropped columns; the rest of
        this app reads its ``content``, ``new_content`` and ``references``
        columns.
    """
    subset_dir = f"data/{language}"
    raw = load_dataset(
        "bigcode/pii_checks_python_java_js",
        data_dir=subset_dir,
        split="train",
        use_auth_token=auth_token,
    )
    unused_columns = ['has_secrets', 'number_secrets', 'path']
    return raw.remove_columns(unused_columns)
21
+
22
+
23
def get_samples_tag(dataset, tag):
    """Return the positions of the samples whose references mention ``tag``.

    A sample matches when its ``references`` field contains the marker
    ``"PI:" + tag.upper()`` as a substring.

    Args:
        dataset: Object exposing ``__len__``, ``add_column(name, values)``,
            ``filter(predicate)`` and column access via ``["index"]``; each
            row passed to the predicate must provide a ``references`` entry.
        tag: PII tag name (case-insensitive), e.g. ``"KEY"`` or
            ``"IP_ADDRESS"``.

    Returns:
        The ``index`` column of the matching rows, i.e. their positions in
        the original ``dataset``.
    """
    # Build the search marker once instead of once per row inside the filter.
    marker = "PI:" + tag.upper()
    # Attach each row's position so it survives the filtering step.
    indexed = dataset.add_column("index", range(len(dataset)))
    matches = indexed.filter(lambda row: marker in row['references'])
    return matches["index"]
29
+
30
+
31
# Left column holds the controls; the second column is created but not used below.
col1, col2 = st.columns([2, 4])
with col1:
    lang = st.selectbox("Select a programming language", ["Python", "Java", "JavaScript"])

# The dataset sub-directories are addressed by the lower-cased language name.
samples = load_data(language=lang.lower())
max_docs = len(samples)

with col1:
    index_example = st.number_input(
        f"Index of the chosen example from the existing {max_docs}",
        min_value=0,
        max_value=max_docs - 1,
        value=0,
        step=1,
    )

# Positions of the samples that contain key / IP address redactions; they are
# listed at the bottom of the page.
keys = get_samples_tag(samples, "KEY")
ips = get_samples_tag(samples, "IP_ADDRESS")

st.write("Here we highlight the difference in code before and after the PII redaction on the Python, Java and Javascript subsets of the-stack-smol. We only show files that were modified.")

example = samples[index_example]
# Each occurrence of "PI:" in the references field is counted as one secret.
delimiter = "PI:"
count = example["references"].count(delimiter)

secret_word = "secret" if count == 1 else "secrets"
st.subheader(f"{lang} example {index_example} has {count} redacted {secret_word}:")
# Show the file content before and after redaction.
diff_viewer.diff_viewer(old_text=example["content"], new_text=example["new_content"], lang="none")

st.markdown("Email redaction examples are very common unlike **IP addresses** and **keys**. To visualize them you can check these samples:")
st.text(f"IP addresses:\n{ips}\nKeys:\n{keys}")
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ streamlit_diff_viewer==0.0.2