|
import json |
|
import os |
|
import random |
|
|
|
import gradio as gr |
|
import spacy |
|
from huggingface_hub import snapshot_download |
|
from spacy import displacy |
|
from spacy.tokens import Span |
|
|
|
|
|
# Make sure the small English pipeline is installed before it is loaded.
# spaCy's own downloader installs into the interpreter that is running this
# script (a bare `python` shell command may resolve to a different one), and
# the check avoids re-running the installer on every start-up.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Highlight colour for each PII entity label, in the order the labels are
# offered in the UI.
_PII_LABEL_COLORS = (
    ("NAME_STUDENT", "#6EB5FF"),
    ("EMAIL", "#42D4B5"),
    ("USERNAME", "#D8B4E2"),
    ("ID_NUM", "#7AE88F"),
    ("PHONE_NUM", "#FFB87D"),
    ("URL_PERSONAL", "#C9B4E2"),
    ("STREET_ADDRESS", "#B4B77F"),
)

# Options dictionary handed to displaCy when rendering entities.
options = {"colors": dict(_PII_LABEL_COLORS)}
|
|
|
|
|
|
|
|
|
def download_data(repo_id="rbiswasfc/pii-datamix", local_dir="./data"):
    """Download a snapshot of a Hugging Face dataset repository.

    Args:
        repo_id: Hub identifier of the dataset repository to fetch.
        local_dir: Directory the snapshot files are placed in.
    """
    snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        local_dir=local_dir,
    )
    print("Data downloaded!")
|
|
|
|
|
# Fetch the dataset files before reading them from ./data.
download_data()

# Read the corpus as UTF-8 explicitly so that parsing does not depend on the
# platform's default text encoding.
with open("./data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# The top-level keys of the corpus are the selectable subsets.
subsets = list(data)

# Every coloured PII label can be used as a focus, plus a "Random" choice
# that applies no filtering.
pii_types = [*options["colors"], "Random"]

nlp = spacy.load("en_core_web_sm")
|
|
|
|
|
|
|
def _iter_bio_spans(labels):
    """Yield ``(start, end, label)`` token spans decoded from BIO tags.

    A ``B-`` tag opens a span (closing any span that is still open), an
    ``I-`` tag extends the open span regardless of its own type, and any
    other tag closes it. ``I-`` tags with no open span are ignored. The
    label of a span is taken from its opening ``B-`` tag; ``end`` is
    exclusive.
    """
    in_entity = False
    start, end = 0, 0

    for index, label in enumerate(labels):
        if label.startswith('B-'):
            if in_entity:
                yield start, end, labels[start][2:]
            start, end = index, index + 1
            in_entity = True
        elif label.startswith('I-') and in_entity:
            end = index + 1
        elif in_entity:
            yield start, end, labels[start][2:]
            in_entity = False

    # A span that runs to the last token is still open here.
    if in_entity:
        yield start, end, labels[start][2:]


def render_sample(subset, pii_type):
    """Pick a random document from a subset and render its PII entities.

    Args:
        subset: Key into the module-level ``data`` mapping.
        pii_type: PII label the chosen document must contain, or
            ``"Random"`` to sample from the whole subset.

    Returns:
        A tuple ``({'document': <sample document field>}, html)`` where
        ``html`` is the displaCy entity markup for the sampled document.

    Raises:
        gr.Error: If the subset has no document satisfying the request.
            Sampling used to retry forever in that situation.
    """
    candidates = data[subset]
    if pii_type != "Random":
        # Filter up front instead of rejection-sampling: the draw is still
        # uniform over the matching documents, but it cannot loop forever
        # when nothing matches.
        candidates = [
            candidate for candidate in candidates
            if pii_type in candidate['piis']
        ]

    if not candidates:
        raise gr.Error(
            f"No documents found in subset '{subset}' for PII type '{pii_type}'."
        )

    sample = random.choice(candidates)

    print("---" * 10)
    print(sample['document'])
    print("---" * 10)

    # Build the Doc from the stored tokenization so that the token indices
    # of the labels line up with the Doc's tokens.
    doc = spacy.tokens.Doc(
        nlp.vocab,
        words=sample['tokens'],
        spaces=sample['trailing_whitespace'],
    )

    doc.ents = [
        Span(doc, start, end, label)
        for start, end, label in _iter_bio_spans(sample['labels'])
    ]

    output = displacy.render(doc, style="ent", jupyter=False, options=options)
    return {'document': sample['document']}, output
|
|
|
|
|
|
|
|
|
# Build the demo page: two selectors, a button that draws a sample, and two
# output panels fed by render_sample.
# NOTE(review): only the two dropdowns are assumed to share the row, with the
# remaining components stacked below it - confirm the intended layout.
with gr.Blocks() as demo:
    with gr.Row():
        # Which data subset to sample from; starts on the first one.
        subset_dropdown = gr.Dropdown(
            choices=subsets,
            value=subsets[0],
            label="Subset",
            info="Select data subset...",
        )

        # Optional PII label the sampled document has to contain.
        focus_pii = gr.Dropdown(
            choices=pii_types,
            value="Random",
            label="PII Focus",
            info="Select a PII type to focus on...",
        )

    sample_btn = gr.Button(value="Sample")
    document_id_display = gr.JSON(label="Document ID")

    sample_display = gr.HTML(label="Example")

    # Each click draws a new sample and refreshes both output panels.
    sample_btn.click(
        render_sample,
        [subset_dropdown, focus_pii],
        [document_id_display, sample_display],
    )

demo.launch()
|
|