# Page-header residue from the hosting site: rbiswasfc, "Update app.py", commit d966e51 (verified)
import json
import os
import random
import subprocess
import sys

import gradio as gr
import spacy
from huggingface_hub import snapshot_download
from spacy import displacy
from spacy.tokens import Span
# download spacy model --
# Run the spaCy CLI with the interpreter that is executing this app: a bare
# ``python`` resolved through the shell may belong to a different environment.
# Arguments are passed as a list, so no shell is involved.
_spacy_download = subprocess.run(
    [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
    check=False,
)
if _spacy_download.returncode != 0:
    # Stay best-effort (the model may already be installed), but do not hide
    # the failure the way a discarded ``os.system`` status would.
    print(f"spaCy model download exited with status {_spacy_download.returncode}")
# # set up colors for PII types ---
# options = {
# "colors": {
# "NAME_STUDENT": "#7FDBFF", # Soft blue
# "EMAIL": "#008080", # Dark cyan
# "USERNAME": "#C3B1E1", # Pastel violet
# "ID_NUM": "#2ECC40", # Medium green
# "PHONE_NUM": "#FF851B", # Deep orange
# "URL_PERSONAL": "#4682B4", # Steel blue
# "STREET_ADDRESS": "#808000", # Muted olive
# }
# }
# Highlight colour for each PII label, in the order the labels are listed.
_PII_COLORS = (
    ("NAME_STUDENT", "#6EB5FF"),    # Lighter blue
    ("EMAIL", "#42D4B5"),           # Light teal
    ("USERNAME", "#D8B4E2"),        # Light lavender
    ("ID_NUM", "#7AE88F"),          # Light green
    ("PHONE_NUM", "#FFB87D"),       # Light peach
    ("URL_PERSONAL", "#C9B4E2"),    # Pale purple
    ("STREET_ADDRESS", "#B4B77F"),  # Light olive
)

# Options mapping with a single "colors" entry (label -> hex colour).
options = {"colors": dict(_PII_COLORS)}
# download datamix ---
def download_data(repo_id="rbiswasfc/pii-datamix", local_dir="./data"):
    """Download a snapshot of a dataset repository from the Hugging Face Hub.

    Args:
        repo_id: Dataset repository to fetch. Defaults to the datamix this
            app displays.
        local_dir: Directory the repository files are placed in.

    Returns:
        Whatever ``snapshot_download`` returns (documented as the local
        snapshot folder path).
    """
    snapshot_path = snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        local_dir=local_dir,
    )
    print("Data downloaded!")
    return snapshot_path
download_data()

# load data ---
# JSON text is UTF-8; decode it explicitly instead of relying on the
# platform's default encoding.
with open("./data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# Top-level keys of the datamix are offered as the selectable subsets.
subsets = list(data.keys())

# Every PII label that has a highlight colour, plus the "Random" option.
pii_types = list(options["colors"].keys())
pii_types.append("Random")

nlp = spacy.load("en_core_web_sm")
# render sample --
def _bio_to_spans(labels):
    """Convert BIO tags into ``(start, end, label)`` token spans (end exclusive).

    A ``B-`` tag opens a span, the ``I-`` tags that follow extend it, and any
    other tag closes it. An ``I-`` tag with no open span is ignored. The span
    label is always taken from the opening ``B-`` tag, prefix stripped.
    """
    spans = []
    start = None  # index of the currently open B- tag, if any
    for index, tag in enumerate(labels):
        if tag.startswith('B-'):
            if start is not None:  # a new entity closes the previous one
                spans.append((start, index, labels[start][2:]))
            start = index
        elif tag.startswith('I-') and start is not None:
            continue  # still inside the open entity
        elif start is not None:
            spans.append((start, index, labels[start][2:]))
            start = None
    # Close an entity that runs to the end of the sequence.
    if start is not None:
        spans.append((start, len(labels), labels[start][2:]))
    return spans


def render_sample(subset, pii_type):
    """Pick a random sample from ``subset`` and render its PII entities as HTML.

    Args:
        subset: Key into the module-level ``data`` mapping.
        pii_type: PII label the sample must contain, or ``"Random"`` to accept
            any sample of the subset.

    Returns:
        A ``({'document': <sample['document']>}, markup)`` tuple, where
        ``markup`` is the displaCy entity rendering of the sample.

    Raises:
        gr.Error: If no sample of the subset satisfies ``pii_type``. Retrying
            random draws until one matched would never terminate in that case.
    """
    candidates = data[subset]
    if pii_type != "Random":
        # Filter up front so an unmatched PII type is detected, not retried.
        candidates = [item for item in candidates if pii_type in item['piis']]
    if not candidates:
        raise gr.Error(
            f"No sample in subset {subset!r} matches PII focus {pii_type!r}."
        )
    sample = random.choice(candidates)

    print("---" * 10)
    print(sample['document'])
    print("---" * 10)

    # render
    doc = spacy.tokens.Doc(
        nlp.vocab,
        words=sample['tokens'],
        spaces=sample['trailing_whitespace'],
    )
    doc.ents = [
        Span(doc, start, end, label)
        for start, end, label in _bio_to_spans(sample['labels'])
    ]
    output = displacy.render(doc, style="ent", jupyter=False, options=options)
    return {'document': sample['document']}, output
# app layout & callback ---
# NOTE(review): the nesting below is reconstructed; it assumes only the two
# dropdowns share the row and that launch() runs after the Blocks context
# closes -- confirm against the deployed app.
# Alternative theme kept for reference: gr.Blocks(theme=gr.themes.Soft())
with gr.Blocks() as demo:
    # Selectors: which subset to draw from and which PII type to require.
    with gr.Row():
        subset_dropdown = gr.Dropdown(
            choices=subsets,
            value=subsets[0],
            label="Subset",
            info="Select data subset...",
        )
        focus_pii = gr.Dropdown(
            choices=pii_types,
            value="Random",
            label="PII Focus",
            info="Select a PII type to focus on...",
        )

    sample_btn = gr.Button("Sample")
    document_id_display = gr.JSON(label="Document ID")
    sample_display = gr.HTML(label="Example")

    # callback ---
    # One click draws a new sample and fills both output components.
    callback_inputs = [subset_dropdown, focus_pii]
    callback_outputs = [document_id_display, sample_display]
    sample_btn.click(
        fn=render_sample,
        inputs=callback_inputs,
        outputs=callback_outputs,
    )

# launch app ---
demo.launch()