rbiswasfc committed on
Commit
36da3fb
1 Parent(s): 276eeb6

initial version

Browse files
Files changed (3) hide show
  1. app.py +119 -0
  2. data/.keep +0 -0
  3. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import random
4
+
5
+ import gradio as gr
6
+ import spacy
7
+ from huggingface_hub import snapshot_download
8
+ from spacy import displacy
9
+ from spacy.tokens import Span
10
+
11
# download spacy model ---
# Only fetch the model when it is not installed yet, use the interpreter that
# is running this app (a bare `python` on PATH may be a different one), and
# fail loudly if the install does not succeed instead of ignoring the status.
if not spacy.util.is_package("en_core_web_sm"):
    import subprocess
    import sys

    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
13
+
14
# set up colors for PII types ---
# Maps each PII label to the highlight colour used for it. The insertion
# order matters: the keys are later listed as the PII choices.
pii_colors = {}
pii_colors["NAME_STUDENT"] = "#7FDBFF"  # Soft blue
pii_colors["EMAIL"] = "#008080"  # Dark cyan
pii_colors["USERNAME"] = "#C3B1E1"  # Pastel violet
pii_colors["ID_NUM"] = "#2ECC40"  # Medium green
pii_colors["PHONE_NUM"] = "#FF851B"  # Deep orange
pii_colors["URL_PERSONAL"] = "#4682B4"  # Steel blue
pii_colors["STREET_ADDRESS"] = "#808000"  # Muted olive

# Options dictionary handed to displacy.render().
options = {"colors": pii_colors}
26
+
27
+
28
+ # download datamix ---
29
+
30
+
31
def download_data():
    """Download the ``rbiswasfc/pii_datamix`` dataset repo into ``./data``.

    Calls ``huggingface_hub.snapshot_download`` and prints a confirmation
    line once the call returns.
    """
    snapshot_kwargs = {
        "repo_id": "rbiswasfc/pii_datamix",
        "repo_type": "dataset",
        "local_dir": "./data",
    }
    snapshot_download(**snapshot_kwargs)
    print("Data downloaded!")
39
+
40
+
41
download_data()

# load data ---
# JSON is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's locale default.
with open("./data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# Top-level keys of the datamix are offered as the selectable subsets.
subsets = list(data.keys())

# Every PII label that has a colour, plus a "Random" choice that applies no
# PII filter when sampling.
pii_types = list(options["colors"].keys())
pii_types.append("Random")

# Only the vocab of this pipeline is used when building Docs for rendering.
nlp = spacy.load("en_core_web_sm")
52
+ # render sample ---
53
+
54
+
55
def _bio_to_spans(labels):
    """Collapse BIO token labels into ``(start, end, entity_type)`` triples.

    ``start`` is inclusive and ``end`` exclusive (token indices). An entity
    opens on a ``B-`` tag, is extended by any following ``I-`` tags, and is
    typed after the ``B-`` tag that opened it. ``I-`` tags with no open
    entity are ignored.
    """
    spans = []
    start = None  # index of the token that opened the current entity, if any
    for index, label in enumerate(labels):
        if start is not None and label.startswith('I-'):
            continue  # still inside the open entity
        if start is not None:
            # Any non-continuation tag closes the open entity; its type comes
            # from its own opening tag, not from the tag that terminated it.
            spans.append((start, index, labels[start][2:]))
            start = None
        if label.startswith('B-'):
            start = index
    if start is not None:
        # Flush an entity that runs through the final token.
        spans.append((start, len(labels), labels[start][2:]))
    return spans


def render_sample(subset, pii_type):
    """Pick a random sample from ``subset`` and render it as displaCy HTML.

    Args:
        subset: Key into the module-level ``data`` mapping.
        pii_type: A PII label that must appear in the sample's ``'piis'``
            entry, or ``"Random"`` to sample without filtering.

    Returns:
        The HTML markup produced by ``displacy.render`` with the sample's
        labelled entities highlighted, or a short HTML notice when the
        subset has no sample matching ``pii_type``.
    """
    candidates = data[subset]
    if pii_type != "Random":
        # Filter up front: retrying random draws until one matches would spin
        # forever when the subset has no sample with this PII type.
        candidates = [entry for entry in candidates if pii_type in entry['piis']]
    if not candidates:
        return "<p>No sample with the selected PII type was found in this subset.</p>"

    sample = random.choice(candidates)

    # Rebuild the document from the stored tokens and their trailing spaces.
    doc = spacy.tokens.Doc(nlp.vocab, words=sample['tokens'], spaces=sample['trailing_whitespace'])
    doc.ents = [
        Span(doc, start, end, entity_type)
        for start, end, entity_type in _bio_to_spans(sample['labels'])
    ]
    return displacy.render(doc, style="ent", jupyter=False, options=options)
88
+
89
+
90
+ # app layout & callback ---
91
# Default Gradio theme recoloured with red / pink hues.
app_theme = gr.themes.Default(
    primary_hue=gr.themes.colors.red,
    secondary_hue=gr.themes.colors.pink,
)

with gr.Blocks(theme=app_theme) as demo:
    # Both selectors share one row.
    with gr.Row():
        subset_dropdown = gr.Dropdown(
            choices=subsets,
            value=subsets[0],
            label="Subset",
            info="Select data subset...",
        )

        focus_pii = gr.Dropdown(
            choices=pii_types,
            value="Random",
            label="PII Focus",
            info="Select a PII type to focus on...",
        )

    # NOTE(review): the original indentation was not recoverable; the button
    # and the HTML output are assumed to sit below the row — confirm.
    sample_btn = gr.Button("Sample")

    sample_display = gr.HTML(label="Example")

    # callback ---
    # Clicking the button renders a new sample into the HTML component.
    sample_btn.click(
        fn=render_sample,
        inputs=[subset_dropdown, focus_pii],
        outputs=sample_display,
    )

# launch app ---
demo.launch()
data/.keep ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ spacy