loubnabnl HF staff committed on
Commit
bb169af
1 Parent(s): 4c12510

setup space and add examples

Browse files
Files changed (2) hide show
  1. app.py +14 -17
  2. data/pii_examples.json +4 -0
app.py CHANGED
@@ -6,17 +6,17 @@ and https://huggingface.co/spaces/SaulLu/diff-visualizer
6
  import streamlit as st
7
  from datasets import load_dataset
8
  import diff_viewer
9
- import os
10
 
11
  st.set_page_config(page_title="PII Visualization", layout="wide")
12
- st.title("PII Visualization")
13
- auth_token = os.environ.get("data-pii") or True
 
 
14
 
15
  @st.cache()
16
  def load_data(language="python"):
17
  # load dataset with modified files with: content, references and language columns
18
- dataset = load_dataset("bigcode/pii_checks_python_java_js", data_dir = f"data/{language}", split="train", use_auth_token=auth_token)
19
- dataset = dataset.remove_columns(['has_secrets', 'number_secrets', 'path'])
20
  return dataset
21
 
22
 
@@ -30,27 +30,24 @@ def get_samples_tag(dataset, tag):
30
 
31
  col1, col2 = st.columns([2, 4])
32
  with col1:
33
- lang = st.selectbox("Select a programming language", ["Python", "Java", "JavaScript"])
34
 
35
  samples = load_data(language=lang.lower())
36
  max_docs = len(samples)
37
 
38
  with col1:
39
- index_example = st.number_input(f"Index of the chosen example from the existing {max_docs}", min_value=0, max_value=max_docs-1, value=0, step=1)
40
 
41
- keys = get_samples_tag(samples, "KEY")
42
- ips = get_samples_tag(samples, "IP_ADDRESS")
43
 
44
- st.write("Here we highlight the difference in code before and after the PII redaction on the Python, Java and Javascript subsets of the-stack-smol. We only show files that were modified.")
45
 
46
  example = samples[index_example]
47
  delimiter = f"PI:"
48
  count = example["references"].count(delimiter)
49
 
50
- secrets = "secret" if count == 1 else "secrets"
51
- st.subheader(f"{lang} example {index_example} has {count} redacted {secrets}:")
52
- diff_viewer.diff_viewer(old_text=example["content"], new_text=example["new_content"], lang="none")
53
- #diff_viewer.diff_viewer(old_text=example["content"], new_text=example["references"], lang="none")
54
-
55
- st.markdown(f"Email redaction examples are very common unlike **IP addresses** and **keys**. To visualize them you can check these samples:")
56
- st.text(f"IP addresses:\n{ips}\nKeys:\n{keys}")
 
6
  import streamlit as st
7
  from datasets import load_dataset
8
  import diff_viewer
 
9
 
10
  st.set_page_config(page_title="PII Visualization", layout="wide")
11
+ st.title("PII Anonymization 🔐")
12
+
13
+ st.markdown("This demo allows the visualization of personal information anonymization on some code files. \
14
+ This is just an illustration of BigCode's PII pipeline and the examples and secrets are synthetic.")
15
 
16
  @st.cache()
17
  def load_data(language="python"):
18
  # load dataset with modified files with: content, references and language columns
19
+ dataset = load_dataset("data", split="train")
 
20
  return dataset
21
 
22
 
 
30
 
31
  col1, col2 = st.columns([2, 4])
32
  with col1:
33
+ lang = st.selectbox("Select a programming language", ["Python"])
34
 
35
  samples = load_data(language=lang.lower())
36
  max_docs = len(samples)
37
 
38
  with col1:
39
+ index_example = st.number_input(f"Choose an example from the existing {max_docs}:", min_value=0, max_value=max_docs-1, value=0, step=1)
40
 
 
 
41
 
42
+ st.markdown("Below we highlight the difference in code before and after the PII on the chosen synthetic example:")
43
 
44
  example = samples[index_example]
45
  delimiter = f"PI:"
46
  count = example["references"].count(delimiter)
47
 
48
+ col1, col2, col3 = st.columns([0.4, 1, 1])
49
+ with col2:
50
+ st.subheader(f"Code before PII redaction")
51
+ with col3:
52
+ st.subheader(f"Code after PII redaction")
53
+ diff_viewer.diff_viewer(old_text=example["content"], new_text=example["new_content"], lang="none")
 
data/pii_examples.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"content":"# Copyright 2022 <connection@test.mail>\n\nimport getpass\nimport telnetlib\n\nuser = input(\"Enter your remote account: \")\npassword = getpass.getpass()\n\naddress_1 = \"132.12.67.81\"\naddress_2 = \"142.12.67.81\"\n\ntn = telnetlib.Telnet(address_1)\n\ntn.read_until(b\"login: \")\ntn.write(user.encode('ascii') + b\"\n\")\nif password:\n tn.read_until(b\"Password: \")\n tn.write(password.encode('ascii') + b\"\n\")\n\ntn.write(b\"ls\n\")\ntn.write(b\"exit\n\")\n\nprint(tn.read_all().decode('ascii'))","secrets":"[{\"tag\": \"IP_ADDRESS\", \"value\": \"132.12.67.81\", \"start\": 161, \"end\": 173}, {\"tag\": \"IP_ADDRESS\", \"value\": \"142.12.67.81\", \"start\": 188, \"end\": 200}, {\"tag\": \"EMAIL\", \"value\": \"connection@test.mail\", \"start\": 18, \"end\": 38}]","has_secrets":true,"number_secrets":3,"new_content":"# Copyright 2022 <anpch@example.com>\n\nimport getpass\nimport telnetlib\n\nuser = input(\"Enter your remote account: \")\npassword = getpass.getpass()\n\naddress_1 = \"192.168.3.11\"\naddress_2 = \"172.16.31.10\"\n\ntn = telnetlib.Telnet(address_1)\n\ntn.read_until(b\"login: \")\ntn.write(user.encode('ascii') + b\"\n\")\nif password:\n tn.read_until(b\"Password: \")\n tn.write(password.encode('ascii') + b\"\n\")\n\ntn.write(b\"ls\n\")\ntn.write(b\"exit\n\")\n\nprint(tn.read_all().decode('ascii'))","modified":true,"references":"# Copyright 2022 <PI:EMAIL:anpch@example.comEND_PI>\n\nimport getpass\nimport telnetlib\n\nuser = input(\"Enter your remote account: \")\npassword = getpass.getpass()\n\naddress_1 = \"PI:IP_ADDRESS:192.168.3.11END_PI\"\naddress_2 = \"PI:IP_ADDRESS:172.16.31.10END_PI\"\n\ntn = telnetlib.Telnet(address_1)\n\ntn.read_until(b\"login: \")\ntn.write(user.encode('ascii') + b\"\n\")\nif password:\n tn.read_until(b\"Password: \")\n tn.write(password.encode('ascii') + b\"\n\")\n\ntn.write(b\"ls\n\")\ntn.write(b\"exit\n\")\n\nprint(tn.read_all().decode('ascii'))"}
2
+ {"content":"'''Generating embeddings with Node2Vec for a graph'''\n\n# Copyright 2022 <loubna@test.mail> or <contact@test.mail>\n\nimport os\nimport gzip\nimport pickle\nfrom tqdm import tqdm\nimport networkx as nx\nfrom nodevectors import Node2Vec\nimport argparse\nfrom datasets import load_dataset\nfrom huggingface_hub import Repository\n\ndef create_node_embeddings(args):\n G = nx.read_edgelist(args.path_graph, delimiter=',', create_using=nx.Graph(), nodetype=int)\n nodes = list(G.nodes())\n n = G.number_of_nodes()\n m = G.number_of_edges()\n \n print(\"Training Node2vec...\")\n model = Node2Vec(n_components=args.n_components, \n walklen=40,\n epochs=30,\n threads=os.cpu_count()) \n\n model.fit(G)\n\n node_embeddings = dict()\n for node in tqdm(nodes):\n node_embeddings[node] = model.predict(node)\n\n repo = Repository(\n local_dir=\"dataset\",\n clone_from=\"loubna\/graph_dataset\",\n token=True,\n repo_type=\"dataset\",\n git_user=\"user\",\n use_auth_token=\"tk_XKyihy89nl89hgEMOskeTHCRZEkBd7\",\n git_email=\"loubna@test.mail\"\n )\n\n print(\"----- Saving the Embeddings -----\")\n file = gzip.GzipFile(\"dataset\/node_embeddings.emb\", 'wb')\n file.write(pickle.dumps(node_embeddings))\n file.close()\n\n\n\nrepo = Repository(local_dir=\"huggingface-hub\", clone_from=\"https:\/\/huggingface.co\/facebook\/wav2vec2-large-960h-lv60\")\nif __name__ ==\"__main__\":\n parser = argparse.ArgumentParser()\n parser.add_argument(\"--path_graph\", type=str, default=\"data\/edgelist.txt\", \n help=\"Path to the graph edges text file\")\n parser.add_argument(\"--n_components\", type=int, default=20,\n help=\"Size of the embedding\")\n \n create_node_embeddings(parser.parse_args())","secrets":"[{\"tag\": \"EMAIL\", \"value\": \"loubna@test.mail\", \"start\": 73, \"end\": 89}, {\"tag\": \"EMAIL\", \"value\": \"contact@test.mail\", \"start\": 95, \"end\": 112}, {\"tag\": \"EMAIL\", \"value\": \"loubna@test.mail\", \"start\": 1100, \"end\": 1116}, {\"tag\": \"KEY\", \"value\": 
\"tk_XKyihy89nl89hgEMOskeTHCRZEkBd7\", \"start\": 1045, \"end\": 1078}]","has_secrets":true,"number_secrets":4,"new_content":"'''Generating embeddings with Node2Vec for a graph'''\n\n# Copyright 2022 <hzdkv@example.com> or <anpch@example.com>\n\nimport os\nimport gzip\nimport pickle\nfrom tqdm import tqdm\nimport networkx as nx\nfrom nodevectors import Node2Vec\nimport argparse\nfrom datasets import load_dataset\nfrom huggingface_hub import Repository\n\ndef create_node_embeddings(args):\n G = nx.read_edgelist(args.path_graph, delimiter=',', create_using=nx.Graph(), nodetype=int)\n nodes = list(G.nodes())\n n = G.number_of_nodes()\n m = G.number_of_edges()\n \n print(\"Training Node2vec...\")\n model = Node2Vec(n_components=args.n_components, \n walklen=40,\n epochs=30,\n threads=os.cpu_count()) \n\n model.fit(G)\n\n node_embeddings = dict()\n for node in tqdm(nodes):\n node_embeddings[node] = model.predict(node)\n\n repo = Repository(\n local_dir=\"dataset\",\n clone_from=\"loubna\/graph_dataset\",\n token=True,\n repo_type=\"dataset\",\n git_user=\"user\",\n use_auth_token=\"ngw6fo1pu3tjgnp9jnlp7vnwvfqb9yn7\",\n git_email=\"hzdkv@example.com\"\n )\n\n print(\"----- Saving the Embeddings -----\")\n file = gzip.GzipFile(\"dataset\/node_embeddings.emb\", 'wb')\n file.write(pickle.dumps(node_embeddings))\n file.close()\n\n\n\nrepo = Repository(local_dir=\"huggingface-hub\", clone_from=\"https:\/\/huggingface.co\/facebook\/wav2vec2-large-960h-lv60\")\nif __name__ ==\"__main__\":\n parser = argparse.ArgumentParser()\n parser.add_argument(\"--path_graph\", type=str, default=\"data\/edgelist.txt\", \n help=\"Path to the graph edges text file\")\n parser.add_argument(\"--n_components\", type=int, default=20,\n help=\"Size of the embedding\")\n \n create_node_embeddings(parser.parse_args())","modified":true,"references":"'''Generating embeddings with Node2Vec for a graph'''\n\n# Copyright 2022 <PI:EMAIL:hzdkv@example.comEND_PI> or 
<PI:EMAIL:anpch@example.comEND_PI>\n\nimport os\nimport gzip\nimport pickle\nfrom tqdm import tqdm\nimport networkx as nx\nfrom nodevectors import Node2Vec\nimport argparse\nfrom datasets import load_dataset\nfrom huggingface_hub import Repository\n\ndef create_node_embeddings(args):\n G = nx.read_edgelist(args.path_graph, delimiter=',', create_using=nx.Graph(), nodetype=int)\n nodes = list(G.nodes())\n n = G.number_of_nodes()\n m = G.number_of_edges()\n \n print(\"Training Node2vec...\")\n model = Node2Vec(n_components=args.n_components, \n walklen=40,\n epochs=30,\n threads=os.cpu_count()) \n\n model.fit(G)\n\n node_embeddings = dict()\n for node in tqdm(nodes):\n node_embeddings[node] = model.predict(node)\n\n repo = Repository(\n local_dir=\"dataset\",\n clone_from=\"loubna\/graph_dataset\",\n token=True,\n repo_type=\"dataset\",\n git_user=\"user\",\n use_auth_token=\"PI:KEY:ngw6fo1pu3tjgnp9jnlp7vnwvfqb9yn7END_PI\",\n git_email=\"PI:EMAIL:hzdkv@example.comEND_PI\"\n )\n\n print(\"----- Saving the Embeddings -----\")\n file = gzip.GzipFile(\"dataset\/node_embeddings.emb\", 'wb')\n file.write(pickle.dumps(node_embeddings))\n file.close()\n\n\n\nrepo = Repository(local_dir=\"huggingface-hub\", clone_from=\"https:\/\/huggingface.co\/facebook\/wav2vec2-large-960h-lv60\")\nif __name__ ==\"__main__\":\n parser = argparse.ArgumentParser()\n parser.add_argument(\"--path_graph\", type=str, default=\"data\/edgelist.txt\", \n help=\"Path to the graph edges text file\")\n parser.add_argument(\"--n_components\", type=int, default=20,\n help=\"Size of the embedding\")\n \n create_node_embeddings(parser.parse_args())"}
3
+ {"content":"import boto3\nimport os\nimport subprocess\nimport platform\nimport sys\n\nclient = boto3.client(\n 's3',\n aws_access_key_id=\"AKIAIOSF889BG8BHRDYNN7EXAMPLE\",\n aws_secret_access_key=\"rt\/YUAmye9BxUBPl89hgEMOTZlmnKYV67H5gJD8\",\n)\n\ndef execute_command(command, args):\n try:\n aws_cmd = \"aws\" if platform.system().lower() != \"windows\" else \"aws.cmd\"\n command_with_args = [aws_cmd, \"cloudformation\", command] + list(args)\n subprocess.check_call(command_with_args)\n\n except subprocess.CalledProcessError as e:\n sys.exit(e.returncode)","secrets":"[{\"tag\": \"KEY\", \"value\": \"rt\/YUAmye9BxUBPl89hgEMOTZlmnKYV67H5gJD8\", \"start\": 184, \"end\": 223}, {\"tag\": \"KEY\", \"value\": \"AKIAIOSF889BG8BHRDYN\", \"start\": 125, \"end\": 145}]","has_secrets":true,"number_secrets":2,"new_content":"import boto3\nimport os\nimport subprocess\nimport platform\nimport sys\n\nclient = boto3.client(\n 's3',\n aws_access_key_id=\"74t3tndxag9o7h0890bnpfzh4olk2h9xN7EXAMPLE\",\n aws_secret_access_key=\"kgfhvu9qnh3mr6eel97y6fq2hezzol8z\",\n)\n\ndef execute_command(command, args):\n try:\n aws_cmd = \"aws\" if platform.system().lower() != \"windows\" else \"aws.cmd\"\n command_with_args = [aws_cmd, \"cloudformation\", command] + list(args)\n subprocess.check_call(command_with_args)\n\n except subprocess.CalledProcessError as e:\n sys.exit(e.returncode)","modified":true,"references":"import boto3\nimport os\nimport subprocess\nimport platform\nimport sys\n\nclient = boto3.client(\n 's3',\n aws_access_key_id=\"PI:KEY:74t3tndxag9o7h0890bnpfzh4olk2h9xEND_PIN7EXAMPLE\",\n aws_secret_access_key=\"PI:KEY:kgfhvu9qnh3mr6eel97y6fq2hezzol8zEND_PI\",\n)\n\ndef execute_command(command, args):\n try:\n aws_cmd = \"aws\" if platform.system().lower() != \"windows\" else \"aws.cmd\"\n command_with_args = [aws_cmd, \"cloudformation\", command] + list(args)\n subprocess.check_call(command_with_args)\n\n except subprocess.CalledProcessError as e:\n sys.exit(e.returncode)"}
4
+ {"content":"import setuptools\n\nwith open(\"README.md\", \"r\", encoding=\"utf-8\") as fh:\n description = fh.read()\n\nsetuptools.setup(\n name=\"project\",\n author=\"maintainer\",\n author_email=\"maintainer@email.com\",\n description=\"NLP Project\",\n long_description=description,\n long_description_content_type=\"text\/markdown\",\n url=\"https:\/\/github.com\/projects-testing45\/tree\/main\/nlp-course1\/nlp\/\",\n python_requires=\">=3.6\",\n)","secrets":"[{\"tag\": \"EMAIL\", \"value\": \"maintainer@email.com\", \"start\": 182, \"end\": 202}]","has_secrets":true,"number_secrets":1,"new_content":"import setuptools\n\nwith open(\"README.md\", \"r\", encoding=\"utf-8\") as fh:\n description = fh.read()\n\nsetuptools.setup(\n name=\"project\",\n author=\"maintainer\",\n author_email=\"nnheo@example.com\",\n description=\"NLP Project\",\n long_description=description,\n long_description_content_type=\"text\/markdown\",\n url=\"https:\/\/github.com\/projects-testing45\/tree\/main\/nlp-course1\/nlp\/\",\n python_requires=\">=3.6\",\n)","modified":true,"references":"import setuptools\n\nwith open(\"README.md\", \"r\", encoding=\"utf-8\") as fh:\n description = fh.read()\n\nsetuptools.setup(\n name=\"project\",\n author=\"maintainer\",\n author_email=\"PI:EMAIL:nnheo@example.comEND_PI\",\n description=\"NLP Project\",\n long_description=description,\n long_description_content_type=\"text\/markdown\",\n url=\"https:\/\/github.com\/projects-testing45\/tree\/main\/nlp-course1\/nlp\/\",\n python_requires=\">=3.6\",\n)"}