pii-public-demo

Runtime error

App Files Community

loubnabnl HF staff committed on Nov 21, 2022

Commit

bb169af

•

1 Parent(s): 4c12510

setup space and add examples

Browse files

Files changed (2) hide show

app.py +14 -17
data/pii_examples.json +4 -0

app.py CHANGED Viewed

@@ -6,17 +6,17 @@ and https://huggingface.co/spaces/SaulLu/diff-visualizer
 import streamlit as st
 from datasets import load_dataset
 import diff_viewer
-import os
 st.set_page_config(page_title="PII Visualization", layout="wide")
-st.title("PII Visualization")
-auth_token = os.environ.get("data-pii") or True
 @st.cache()
 def load_data(language="python"):
     # load dataset with modified files with: content, references and language columns
-    dataset = load_dataset("bigcode/pii_checks_python_java_js", data_dir = f"data/{language}", split="train", use_auth_token=auth_token)
-    dataset = dataset.remove_columns(['has_secrets', 'number_secrets', 'path'])
     return dataset
@@ -30,27 +30,24 @@ def get_samples_tag(dataset, tag):
 col1, col2 = st.columns([2, 4])
 with col1:
-    lang = st.selectbox("Select a programming language", ["Python", "Java", "JavaScript"])
 samples = load_data(language=lang.lower())
 max_docs = len(samples)
 with col1:
-    index_example = st.number_input(f"Index of the chosen example from the existing {max_docs}", min_value=0, max_value=max_docs-1, value=0, step=1)
-keys = get_samples_tag(samples, "KEY")
-ips = get_samples_tag(samples, "IP_ADDRESS")
-st.write("Here we highlight the difference in code before and after the PII redaction on the Python, Java and Javascript subsets of the-stack-smol. We only show files that were modified.")
 example = samples[index_example]
 delimiter = f"PI:"
 count = example["references"].count(delimiter)
-secrets = "secret" if count == 1 else "secrets"
-st.subheader(f"{lang} example {index_example} has {count} redacted {secrets}:")
-diff_viewer.diff_viewer(old_text=example["content"], new_text=example["new_content"], lang="none")
-#diff_viewer.diff_viewer(old_text=example["content"], new_text=example["references"], lang="none")
-st.markdown(f"Email redaction examples are very common unlike **IP addresses** and **keys**. To visualize them you can check these samples:")
-st.text(f"IP addresses:\n{ips}\nKeys:\n{keys}")

 import streamlit as st
 from datasets import load_dataset
 import diff_viewer
 st.set_page_config(page_title="PII Visualization", layout="wide")
+st.title("PII Anonymization 🔐")
+st.markdown("This demo allows the visualization of personal information anonymization on some code files. \
+    This is just an illustration of BigCode's PII pipeline and the examples and secrets are synthetic.")
 @st.cache()
 def load_data(language="python"):
     # load dataset with modified files with: content, references and language columns
+    dataset = load_dataset("data", split="train")
     return dataset
 col1, col2 = st.columns([2, 4])
 with col1:
+    lang = st.selectbox("Select a programming language", ["Python"])
 samples = load_data(language=lang.lower())
 max_docs = len(samples)
 with col1:
+    index_example = st.number_input(f"Choose an example from the existing {max_docs}:", min_value=0, max_value=max_docs-1, value=0, step=1)
+st.markdown("Below we highlight the difference in code before and after the PII on the chosen synthetic example:")
 example = samples[index_example]
 delimiter = f"PI:"
 count = example["references"].count(delimiter)
+col1, col2, col3 = st.columns([0.4, 1, 1])
+with col2:
+    st.subheader(f"Code before PII redaction")
+with col3:
+    st.subheader(f"Code after PII redaction")
+diff_viewer.diff_viewer(old_text=example["content"], new_text=example["new_content"], lang="none")

data/pii_examples.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{"content":"# Copyright 2022 <connection@test.mail>\n\nimport getpass\nimport telnetlib\n\nuser = input(\"Enter your remote account: \")\npassword = getpass.getpass()\n\naddress_1 = \"132.12.67.81\"\naddress_2 = \"142.12.67.81\"\n\ntn = telnetlib.Telnet(address_1)\n\ntn.read_until(b\"login: \")\ntn.write(user.encode('ascii') + b\"\n\")\nif password:\n    tn.read_until(b\"Password: \")\n    tn.write(password.encode('ascii') + b\"\n\")\n\ntn.write(b\"ls\n\")\ntn.write(b\"exit\n\")\n\nprint(tn.read_all().decode('ascii'))","secrets":"[{\"tag\": \"IP_ADDRESS\", \"value\": \"132.12.67.81\", \"start\": 161, \"end\": 173}, {\"tag\": \"IP_ADDRESS\", \"value\": \"142.12.67.81\", \"start\": 188, \"end\": 200}, {\"tag\": \"EMAIL\", \"value\": \"connection@test.mail\", \"start\": 18, \"end\": 38}]","has_secrets":true,"number_secrets":3,"new_content":"# Copyright 2022 <anpch@example.com>\n\nimport getpass\nimport telnetlib\n\nuser = input(\"Enter your remote account: \")\npassword = getpass.getpass()\n\naddress_1 = \"192.168.3.11\"\naddress_2 = \"172.16.31.10\"\n\ntn = telnetlib.Telnet(address_1)\n\ntn.read_until(b\"login: \")\ntn.write(user.encode('ascii') + b\"\n\")\nif password:\n    tn.read_until(b\"Password: \")\n    tn.write(password.encode('ascii') + b\"\n\")\n\ntn.write(b\"ls\n\")\ntn.write(b\"exit\n\")\n\nprint(tn.read_all().decode('ascii'))","modified":true,"references":"# Copyright 2022 <PI:EMAIL:anpch@example.comEND_PI>\n\nimport getpass\nimport telnetlib\n\nuser = input(\"Enter your remote account: \")\npassword = getpass.getpass()\n\naddress_1 = \"PI:IP_ADDRESS:192.168.3.11END_PI\"\naddress_2 = \"PI:IP_ADDRESS:172.16.31.10END_PI\"\n\ntn = telnetlib.Telnet(address_1)\n\ntn.read_until(b\"login: \")\ntn.write(user.encode('ascii') + b\"\n\")\nif password:\n    tn.read_until(b\"Password: \")\n    tn.write(password.encode('ascii') + b\"\n\")\n\ntn.write(b\"ls\n\")\ntn.write(b\"exit\n\")\n\nprint(tn.read_all().decode('ascii'))"}
+{"content":"'''Generating embeddings with Node2Vec for a graph'''\n\n# Copyright 2022 <loubna@test.mail> or <contact@test.mail>\n\nimport os\nimport gzip\nimport pickle\nfrom tqdm import tqdm\nimport networkx as nx\nfrom nodevectors import Node2Vec\nimport argparse\nfrom datasets import load_dataset\nfrom huggingface_hub import Repository\n\ndef create_node_embeddings(args):\n    G = nx.read_edgelist(args.path_graph, delimiter=',', create_using=nx.Graph(), nodetype=int)\n    nodes = list(G.nodes())\n    n = G.number_of_nodes()\n    m = G.number_of_edges()\n    \n    print(\"Training Node2vec...\")\n    model = Node2Vec(n_components=args.n_components, \n                walklen=40,\n                epochs=30,\n                threads=os.cpu_count())  \n\n    model.fit(G)\n\n    node_embeddings = dict()\n    for node in tqdm(nodes):\n        node_embeddings[node] = model.predict(node)\n\n    repo = Repository(\n        local_dir=\"dataset\",\n        clone_from=\"loubna\/graph_dataset\",\n        token=True,\n        repo_type=\"dataset\",\n        git_user=\"user\",\n        use_auth_token=\"tk_XKyihy89nl89hgEMOskeTHCRZEkBd7\",\n        git_email=\"loubna@test.mail\"\n    )\n\n    print(\"----- Saving the Embeddings -----\")\n    file = gzip.GzipFile(\"dataset\/node_embeddings.emb\", 'wb')\n    file.write(pickle.dumps(node_embeddings))\n    file.close()\n\n\n\nrepo = Repository(local_dir=\"huggingface-hub\", clone_from=\"https:\/\/huggingface.co\/facebook\/wav2vec2-large-960h-lv60\")\nif __name__ ==\"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--path_graph\", type=str, default=\"data\/edgelist.txt\", \n        help=\"Path to the graph edges text file\")\n    parser.add_argument(\"--n_components\", type=int, default=20,\n                        help=\"Size of the embedding\")\n    \n    create_node_embeddings(parser.parse_args())","secrets":"[{\"tag\": \"EMAIL\", \"value\": \"loubna@test.mail\", \"start\": 73, \"end\": 89}, 
{\"tag\": \"EMAIL\", \"value\": \"contact@test.mail\", \"start\": 95, \"end\": 112}, {\"tag\": \"EMAIL\", \"value\": \"loubna@test.mail\", \"start\": 1100, \"end\": 1116}, {\"tag\": \"KEY\", \"value\": \"tk_XKyihy89nl89hgEMOskeTHCRZEkBd7\", \"start\": 1045, \"end\": 1078}]","has_secrets":true,"number_secrets":4,"new_content":"'''Generating embeddings with Node2Vec for a graph'''\n\n# Copyright 2022 <hzdkv@example.com> or <anpch@example.com>\n\nimport os\nimport gzip\nimport pickle\nfrom tqdm import tqdm\nimport networkx as nx\nfrom nodevectors import Node2Vec\nimport argparse\nfrom datasets import load_dataset\nfrom huggingface_hub import Repository\n\ndef create_node_embeddings(args):\n    G = nx.read_edgelist(args.path_graph, delimiter=',', create_using=nx.Graph(), nodetype=int)\n    nodes = list(G.nodes())\n    n = G.number_of_nodes()\n    m = G.number_of_edges()\n    \n    print(\"Training Node2vec...\")\n    model = Node2Vec(n_components=args.n_components, \n                walklen=40,\n                epochs=30,\n                threads=os.cpu_count())  \n\n    model.fit(G)\n\n    node_embeddings = dict()\n    for node in tqdm(nodes):\n        node_embeddings[node] = model.predict(node)\n\n    repo = Repository(\n        local_dir=\"dataset\",\n        clone_from=\"loubna\/graph_dataset\",\n        token=True,\n        repo_type=\"dataset\",\n        git_user=\"user\",\n        use_auth_token=\"ngw6fo1pu3tjgnp9jnlp7vnwvfqb9yn7\",\n        git_email=\"hzdkv@example.com\"\n    )\n\n    print(\"----- Saving the Embeddings -----\")\n    file = gzip.GzipFile(\"dataset\/node_embeddings.emb\", 'wb')\n    file.write(pickle.dumps(node_embeddings))\n    file.close()\n\n\n\nrepo = Repository(local_dir=\"huggingface-hub\", clone_from=\"https:\/\/huggingface.co\/facebook\/wav2vec2-large-960h-lv60\")\nif __name__ ==\"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--path_graph\", type=str, default=\"data\/edgelist.txt\", \n        help=\"Path 
to the graph edges text file\")\n    parser.add_argument(\"--n_components\", type=int, default=20,\n                        help=\"Size of the embedding\")\n    \n    create_node_embeddings(parser.parse_args())","modified":true,"references":"'''Generating embeddings with Node2Vec for a graph'''\n\n# Copyright 2022 <PI:EMAIL:hzdkv@example.comEND_PI> or <PI:EMAIL:anpch@example.comEND_PI>\n\nimport os\nimport gzip\nimport pickle\nfrom tqdm import tqdm\nimport networkx as nx\nfrom nodevectors import Node2Vec\nimport argparse\nfrom datasets import load_dataset\nfrom huggingface_hub import Repository\n\ndef create_node_embeddings(args):\n    G = nx.read_edgelist(args.path_graph, delimiter=',', create_using=nx.Graph(), nodetype=int)\n    nodes = list(G.nodes())\n    n = G.number_of_nodes()\n    m = G.number_of_edges()\n    \n    print(\"Training Node2vec...\")\n    model = Node2Vec(n_components=args.n_components, \n                walklen=40,\n                epochs=30,\n                threads=os.cpu_count())  \n\n    model.fit(G)\n\n    node_embeddings = dict()\n    for node in tqdm(nodes):\n        node_embeddings[node] = model.predict(node)\n\n    repo = Repository(\n        local_dir=\"dataset\",\n        clone_from=\"loubna\/graph_dataset\",\n        token=True,\n        repo_type=\"dataset\",\n        git_user=\"user\",\n        use_auth_token=\"PI:KEY:ngw6fo1pu3tjgnp9jnlp7vnwvfqb9yn7END_PI\",\n        git_email=\"PI:EMAIL:hzdkv@example.comEND_PI\"\n    )\n\n    print(\"----- Saving the Embeddings -----\")\n    file = gzip.GzipFile(\"dataset\/node_embeddings.emb\", 'wb')\n    file.write(pickle.dumps(node_embeddings))\n    file.close()\n\n\n\nrepo = Repository(local_dir=\"huggingface-hub\", clone_from=\"https:\/\/huggingface.co\/facebook\/wav2vec2-large-960h-lv60\")\nif __name__ ==\"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--path_graph\", type=str, default=\"data\/edgelist.txt\", \n        help=\"Path to the graph edges text 
file\")\n    parser.add_argument(\"--n_components\", type=int, default=20,\n                        help=\"Size of the embedding\")\n    \n    create_node_embeddings(parser.parse_args())"}
+{"content":"import boto3\nimport os\nimport subprocess\nimport platform\nimport sys\n\nclient = boto3.client(\n    's3',\n    aws_access_key_id=\"AKIAIOSF889BG8BHRDYNN7EXAMPLE\",\n    aws_secret_access_key=\"rt\/YUAmye9BxUBPl89hgEMOTZlmnKYV67H5gJD8\",\n)\n\ndef execute_command(command, args):\n    try:\n        aws_cmd = \"aws\" if platform.system().lower() != \"windows\" else \"aws.cmd\"\n        command_with_args = [aws_cmd, \"cloudformation\", command] + list(args)\n        subprocess.check_call(command_with_args)\n\n    except subprocess.CalledProcessError as e:\n        sys.exit(e.returncode)","secrets":"[{\"tag\": \"KEY\", \"value\": \"rt\/YUAmye9BxUBPl89hgEMOTZlmnKYV67H5gJD8\", \"start\": 184, \"end\": 223}, {\"tag\": \"KEY\", \"value\": \"AKIAIOSF889BG8BHRDYN\", \"start\": 125, \"end\": 145}]","has_secrets":true,"number_secrets":2,"new_content":"import boto3\nimport os\nimport subprocess\nimport platform\nimport sys\n\nclient = boto3.client(\n    's3',\n    aws_access_key_id=\"74t3tndxag9o7h0890bnpfzh4olk2h9xN7EXAMPLE\",\n    aws_secret_access_key=\"kgfhvu9qnh3mr6eel97y6fq2hezzol8z\",\n)\n\ndef execute_command(command, args):\n    try:\n        aws_cmd = \"aws\" if platform.system().lower() != \"windows\" else \"aws.cmd\"\n        command_with_args = [aws_cmd, \"cloudformation\", command] + list(args)\n        subprocess.check_call(command_with_args)\n\n    except subprocess.CalledProcessError as e:\n        sys.exit(e.returncode)","modified":true,"references":"import boto3\nimport os\nimport subprocess\nimport platform\nimport sys\n\nclient = boto3.client(\n    's3',\n    aws_access_key_id=\"PI:KEY:74t3tndxag9o7h0890bnpfzh4olk2h9xEND_PIN7EXAMPLE\",\n    aws_secret_access_key=\"PI:KEY:kgfhvu9qnh3mr6eel97y6fq2hezzol8zEND_PI\",\n)\n\ndef execute_command(command, args):\n    try:\n        aws_cmd = \"aws\" if platform.system().lower() != \"windows\" else \"aws.cmd\"\n        command_with_args = [aws_cmd, \"cloudformation\", command] + list(args)\n        
subprocess.check_call(command_with_args)\n\n    except subprocess.CalledProcessError as e:\n        sys.exit(e.returncode)"}
+{"content":"import setuptools\n\nwith open(\"README.md\", \"r\", encoding=\"utf-8\") as fh:\n    description = fh.read()\n\nsetuptools.setup(\n    name=\"project\",\n    author=\"maintainer\",\n    author_email=\"maintainer@email.com\",\n    description=\"NLP Project\",\n    long_description=description,\n    long_description_content_type=\"text\/markdown\",\n    url=\"https:\/\/github.com\/projects-testing45\/tree\/main\/nlp-course1\/nlp\/\",\n    python_requires=\">=3.6\",\n)","secrets":"[{\"tag\": \"EMAIL\", \"value\": \"maintainer@email.com\", \"start\": 182, \"end\": 202}]","has_secrets":true,"number_secrets":1,"new_content":"import setuptools\n\nwith open(\"README.md\", \"r\", encoding=\"utf-8\") as fh:\n    description = fh.read()\n\nsetuptools.setup(\n    name=\"project\",\n    author=\"maintainer\",\n    author_email=\"nnheo@example.com\",\n    description=\"NLP Project\",\n    long_description=description,\n    long_description_content_type=\"text\/markdown\",\n    url=\"https:\/\/github.com\/projects-testing45\/tree\/main\/nlp-course1\/nlp\/\",\n    python_requires=\">=3.6\",\n)","modified":true,"references":"import setuptools\n\nwith open(\"README.md\", \"r\", encoding=\"utf-8\") as fh:\n    description = fh.read()\n\nsetuptools.setup(\n    name=\"project\",\n    author=\"maintainer\",\n    author_email=\"PI:EMAIL:nnheo@example.comEND_PI\",\n    description=\"NLP Project\",\n    long_description=description,\n    long_description_content_type=\"text\/markdown\",\n    url=\"https:\/\/github.com\/projects-testing45\/tree\/main\/nlp-course1\/nlp\/\",\n    python_requires=\">=3.6\",\n)"}