MikeG27 committed on
Commit
3d15423
·
verified ·
1 Parent(s): cdbd070

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +5 -6
  2. app.py +474 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
- title: Bert Pii
3
- emoji: 📉
4
- colorFrom: purple
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.46.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Gravitee PII
3
+ emoji: 💻
4
+ colorFrom: pink
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.31.0
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
  ---
 
 
app.py ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: E501, INP001, FBT001
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Dict, List, Tuple
6
+
7
+ import gradio as gr
8
+ import torch
9
+ from optimum.onnxruntime import ORTModelForTokenClassification
10
+ from transformers import AutoTokenizer
11
+
12
+ # Hugging Face model
13
+ MODEL_NAME = "gravitee-io/bert-small-pii-detection"
14
+
15
def load_model() -> Tuple[ORTModelForTokenClassification, AutoTokenizer]:
    """Load the ONNX token-classification model and tokenizer from Hugging Face.

    The quantized export (``model.quant.onnx``) is tried first; if loading it
    fails for any reason, the regular ``model.onnx`` file is loaded instead.
    The ``HUGGINGFACE_TOKEN`` environment variable, when set, is passed as the
    access token for every download.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If the tokenizer, or both model files, cannot be loaded.
            The underlying exception is chained as ``__cause__``.
    """
    import os

    # Read the token once so all three downloads use the same credentials.
    token = os.getenv("HUGGINGFACE_TOKEN")

    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=token)

        try:
            model = ORTModelForTokenClassification.from_pretrained(
                MODEL_NAME,
                file_name="model.quant.onnx",
                token=token,
            )
        except Exception as quant_error:
            # `except Exception` (not a bare `except`) so Ctrl-C / SystemExit
            # still abort start-up instead of triggering the fallback.
            print(f"Quantized model unavailable ({quant_error}); falling back to model.onnx")
            model = ORTModelForTokenClassification.from_pretrained(
                MODEL_NAME,
                file_name="model.onnx",
                token=token,
            )
    except Exception as e:
        raise ValueError(f"Could not load model {MODEL_NAME}: {e}") from e

    return model, tokenizer
43
+
44
def convert_predictions_to_spans(predictions: List[int], offset_mapping: List[Tuple[int, int]], id2label: Dict[int, str], text: str) -> List[Dict]:
    """Convert token-level BIO predictions into character-level entity spans.

    Args:
        predictions: Predicted label id for each token.
        offset_mapping: ``(start, end)`` character offsets for each token;
            tokens whose offsets are ``(0, 0)`` are treated as special tokens
            and skipped.
        id2label: Mapping from label id to label string (``"O"``,
            ``"B-<TYPE>"`` or ``"I-<TYPE>"``).
        text: The text the offsets refer to.

    Returns:
        A list of dicts with keys ``start``, ``end``, ``label`` (lower-cased
        entity type) and ``text`` (the covered substring), in text order.

    Decoding is lenient: an ``I-`` tag that does not continue the currently
    open entity (no entity open, or a different type) closes the open entity
    and starts a new one. Without this, an entity whose ``B-`` token was
    dropped would be lost, and a later matching ``I-`` tag could stretch an
    entity across unrelated tokens.
    """
    spans = []
    current_entity = None

    for pred, (start, end) in zip(predictions, offset_mapping):
        if start == end == 0:  # Skip special tokens
            continue

        label = id2label[pred]
        prefix = label[:2]
        entity_type = label[2:].lower()

        # Same-type continuation: extend the open entity and move on.
        if prefix == "I-" and current_entity and entity_type == current_entity["label"]:
            current_entity["end"] = end
            current_entity["text"] = text[current_entity["start"]:end]
            continue

        # Every other tag ends whatever entity is currently open.
        if current_entity:
            spans.append(current_entity)
            current_entity = None

        # "B-" always opens an entity; an orphan / mismatched "I-" does too.
        if prefix in ("B-", "I-"):
            current_entity = {
                "start": start,
                "end": end,
                "label": entity_type,
                "text": text[start:end],
            }

    # Flush the entity still open at the end of the sequence.
    if current_entity:
        spans.append(current_entity)

    return spans
80
+
81
# Eagerly load the model and tokenizer once, at import time, so every
# prediction reuses the same instances (see get_model_info below).
print("Loading model from Hugging Face...")
_model, _tokenizer = load_model()
print(f"Model {MODEL_NAME} loaded successfully!")
85
+
86
def get_model_info():
    """Return the ``(model, tokenizer)`` pair that was loaded at import time."""
    loaded_pair = (_model, _tokenizer)
    return loaded_pair
89
+
90
def predict_entities(text: str, threshold: float) -> Dict:
    """Run the token-classification model on ``text`` and return entity spans.

    The text is tokenized (truncated to at most 512 tokens), each token gets
    its highest-probability label, and tokens whose probability is below
    ``threshold`` are treated as outside any entity before spans are built.

    Args:
        text: Input text to analyse.
        threshold: Minimum softmax probability for a token's label to be kept.

    Returns:
        ``{"text": text, "entities": [...]}`` where each entity is a dict with
        keys ``entity``, ``word``, ``start``, ``end`` and ``score``. If
        anything raises, the dict has an empty ``entities`` list and an extra
        ``"error"`` key holding the exception message.
    """
    try:
        model, tokenizer = get_model_info()
        id2label = model.config.id2label

        # Look the "O" label id up instead of assuming it is 0; fall back to
        # 0 only when the label map has no "O" entry.
        outside_id = next((idx for idx, name in id2label.items() if name == "O"), 0)

        # Tokenize input text
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            return_offsets_mapping=True,
            max_length=512,
        )

        # The offsets are only needed for span building, not by the model.
        offset_mapping = inputs.pop("offset_mapping")[0].tolist()

        # Run inference
        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            best_scores, best_ids = torch.max(probabilities, dim=-1)

        # Index 0 selects the single sequence in the batch.
        predicted_class_ids = best_ids[0].tolist()
        prediction_scores = best_scores[0].tolist()

        # Low-confidence tokens are demoted to the outside label.
        filtered_predictions = [
            pred if score >= threshold else outside_id
            for pred, score in zip(predicted_class_ids, prediction_scores)
        ]

        # Convert to spans
        spans = convert_predictions_to_spans(filtered_predictions, offset_mapping, id2label, text)

        # Convert to gradio format
        entities = [
            {
                "entity": span["label"],
                "word": span["text"],
                "start": span["start"],
                "end": span["end"],
                "score": 1.0,  # Placeholder: tokens were already filtered by threshold
            }
            for span in spans
        ]

        return {
            "text": text,
            "entities": entities,
        }

    except Exception as e:
        return {
            "text": text,
            "entities": [],
            "error": str(e),
        }
145
+
146
def format_text(text: str, format_type: str) -> str:
    """Pretty-print ``text`` according to ``format_type`` (best effort).

    Args:
        text: Raw input text.
        format_type: One of ``"None"``, ``"JSON"``, ``"XML"``, ``"HTML"`` or
            ``"SQL"``; any other value returns ``text`` unchanged.

    Returns:
        The re-formatted text. For JSON and XML, the original ``text`` is
        returned unchanged when it cannot be parsed. For HTML, a simple
        string-replacement fallback is used when BeautifulSoup is unavailable
        or fails. Formatting only changes whitespace and layout: SQL keeps the
        original letter case and JSON keeps non-ASCII characters unescaped, so
        the values the model sees are not altered.
    """
    if format_type == "JSON":
        try:
            import json

            # ensure_ascii=False keeps characters such as accented names as-is
            # instead of rewriting them to \uXXXX escapes.
            return json.dumps(json.loads(text), indent=2, ensure_ascii=False)
        except Exception:
            return text

    if format_type == "XML":
        try:
            import xml.etree.ElementTree as ET
            from xml.dom import minidom

            # Remove b' prefix if present (text pasted from a bytes repr)
            clean_text = text
            if text.startswith("b'") and text.endswith("'"):
                clean_text = text[2:-1]

            # NOTE(security): the stdlib XML parsers are not hardened against
            # malicious documents (e.g. entity-expansion attacks); this parses
            # user-supplied text. Consider defusedxml if that is a concern.
            root = ET.fromstring(clean_text)
            rough_string = ET.tostring(root, "unicode")
            return minidom.parseString(rough_string).toprettyxml(indent=" ")
        except Exception:
            return text

    if format_type == "HTML":
        try:
            from bs4 import BeautifulSoup

            return BeautifulSoup(text, "html.parser").prettify()
        except Exception:
            # Fallback: simple HTML formatting
            formatted = text.replace("><", ">\n<")
            formatted = formatted.replace("<tr>", "\n <tr>")
            formatted = formatted.replace("<td>", "\n <td>")
            formatted = formatted.replace("<th>", "\n <th>")
            return formatted

    if format_type == "SQL":
        import re

        # Start a new line before each clause keyword, matched
        # case-insensitively. The keyword is re-emitted exactly as written and
        # nothing else is touched, so literals and identifiers keep their
        # case. The look-ahead leaves the trailing space unconsumed so that
        # back-to-back keywords are each matched.
        clause = re.compile(
            r" (ORDER BY|GROUP BY|HAVING|WHERE|LIMIT|FROM|AND|OR)(?= )",
            re.IGNORECASE,
        )

        def _break_before(match):
            keyword = match.group(1)
            indent = " " if keyword.upper() in ("AND", "OR") else ""
            return "\n" + indent + keyword

        return clause.sub(_break_before, text)

    # "None" and any unrecognised format: leave the text untouched.
    return text
201
+
202
def ner(text: str, threshold: float, data_type: str = None, format_input: bool = False) -> List[Tuple[str, str]]:
    """Gradio callback: detect entities and return highlighted-text segments.

    When ``format_input`` is set and ``data_type`` names a structured format
    (anything other than "Natural Text"), the text is pretty-printed first and
    the formatted version is both analysed and displayed.

    Returns:
        A list of ``(segment, label)`` tuples covering the displayed text in
        order; ``label`` is the upper-cased entity type, or ``None`` for text
        outside any entity. If prediction reports an error, the whole text is
        returned as one unlabelled segment.
    """
    wants_formatting = format_input and data_type and data_type != "Natural Text"
    display_text = format_text(text, data_type) if wants_formatting else text
    result = predict_entities(display_text, threshold)

    if "error" in result:
        return [(display_text, None)]

    # Walk the entities left to right, emitting the plain text between them.
    segments = []
    cursor = 0
    for found in sorted(result["entities"], key=lambda item: item["start"]):
        begin = found["start"]
        if begin > cursor:
            segments.append((display_text[cursor:begin], None))

        segments.append((found["word"], found["entity"].upper()))
        cursor = found["end"]

    # Whatever follows the last entity is plain text.
    if cursor < len(display_text):
        segments.append((display_text[cursor:], None))

    return segments
234
+
235
+ # Example inputs as [text, confidence threshold, data type] rows, grouped by data type: Natural Text, HTML, JSON, SQL, XML
236
+ examples = [
237
+ # Natural Text examples (longer, more comprehensive)
238
+ [
239
+ "Dr. Sarah Martinez, age 34, works as a Senior Data Scientist at TechCorp International. Her employee ID is TC-DS-5591 and she joined the company on 2019-03-15. Sarah lives at 1247 Oak Avenue, Apartment 5B, Portland, Oregon 97205. Her work phone is 503-555-0147 and personal email is sarah.martinez@personalmail.com. For banking, she uses account TCBK89012345678901 at First National Bank. Her driver's license number is OR-DL-M8829134 and her social security number is 123-45-6789. She recently traveled to London using passport US-P-543216789 and her frequent flyer number with Delta Airlines is DL987654321.",
240
+ 0.35,
241
+ "Natural Text"
242
+ ],
243
+ [
244
+ "The customer database contains the following entries: Michael Chen (DOB: 1985-07-22, age 38) residing at 789 Pine Street, Suite 200, San Francisco, CA 94102. His contact details include phone 415-555-0298 and email michael.chen@businessmail.org. Financial information: Chase Bank account CH-5567889012345678, credit card 4532-1234-5678-9012 (exp: 08/2027, CVV: 451). Professional details: Software Engineer at InnovateTech LLC, employee ID IT-SE-7793, salary $125,000. Government IDs include SSN 987-65-4321, California driver's license CA-DL-B1234567, and passport number US-578912345. His device MAC address is aa:bb:cc:dd:ee:ff and IMEI 358240051111110.",
245
+ 0.35,
246
+ "Natural Text"
247
+ ],
248
+ [
249
+ "Security incident report for Lisa Thompson (ID: LT-2023-001): On 2023-11-15 at 14:30 PST, user accessed system from IP address 192.168.1.100 using API key api_key_abc123xyz789. Employee details: Lisa Thompson, age 29, title Senior Security Analyst, department Cybersecurity, hired 2021-09-01. Home address: 456 Maple Drive, Unit 3C, Seattle, WA 98109. Contact: phone 206-555-0189, work email lisa.thompson@company.com. Banking: Wells Fargo account WF-4455667788990011, routing number 021000021. Government IDs: SSN 555-44-3333, WA driver's license WA-DL-THOMP567, passport US-890123456. Vehicle: 2020 Honda Civic, license plate WA-ABC1234, VIN 1HGBH41JXMN109186.",
250
+ 0.35,
251
+ "Natural Text"
252
+ ],
253
+ [
254
+ "Patient intake form: Dr. Robert Kim (Medical License: MD-12345-WA), age 42, practices at Seattle General Hospital, 1500 Medical Center Drive, Seattle, WA 98101. Phone: 206-555-0234, fax: 206-555-0235, email: dr.kim@seattlegeneral.org. Patient information: Jennifer Walsh, DOB 1990-12-03 (age 33), SSN 111-22-3333, address 2100 Broadway Ave, Apt 15D, Seattle, WA 98122. Insurance: Blue Cross Blue Shield, policy BC-556677889900, group 12345. Emergency contact: Mark Walsh (spouse), phone 206-555-0167. Medical history includes prescription for Medication XYZ, DEA number DR1234567. Appointment scheduled for 2024-01-20 at 10:00 AM, confirmation code CONF-789456.",
255
+ 0.35,
256
+ "Natural Text"
257
+ ],
258
+ # HTML samples (longer, more complex)
259
+ [
260
+ '<table border=\"1\"><tr><th>api_key</th><td>PmtrSlgEzO PmtrSlgEzO br</td></tr><tr><th>page</th><td>73595</td></tr><tr><th>max_primary_general_date</th><td>1992-09-22</td></tr><tr><th>sort</th><td>RqJu PZwhjrbcS</td></tr><tr><th>election_type_id</th><td>PFTZDOBxIl</td></tr><tr><th>election_district</th><td>XNc7rk</td></tr><tr><th>max_election_date</th><td>2007-02-15</td></tr><tr><th>sort_null_only</th><td>False</td></tr><tr><th>min_election_date</th><td>2014-06-27</td></tr><tr><th>per_page</th><td>62971536</td></tr><tr><th>min_primary_general_date</th><td>1982-03-22</td></tr><tr><th>election_state</th><td>xzJis</td></tr><tr><th>election_party</th><td>lHUet 1vtAg5J lHUet</td></tr><tr><th>min_update_date</th><td>1984-07-25</td></tr><tr><th>sort_nulls_last</th><td>False</td></tr><tr><th>max_create_date</th><td>1980-01-02</td></tr><tr><th>max_update_date</th><td>1997-11-10</td></tr><tr><th>sort_hide_null</th><td>True</td></tr><tr><th>election_year</th><td>hNf2nYGMbX</td></tr><tr><th>min_create_date</th><td>2000-11-25</td></tr></table>',
261
+ 0.35,
262
+ "HTML"
263
+ ],
264
+ [
265
+ '<table border=\"1\"><tr><th>religion</th><td>Christianity</td></tr><tr><th>api-version</th><td>dCwMNqR</td></tr><tr><th>to_contact</th><td>VirginiaTBarrett@fleckens.hu</td></tr><tr><th>spot</th><td>6765 2278 Norma Avenue Mcbee , SC 33987</td></tr><tr><th>endTime</th><td>2022-09-07 14:17:30</td></tr><tr><th>startTime</th><td>2001-09-20 20:45:43</td></tr><tr><th>facility</th><td>Apt. 074</td></tr><tr><th>vocation</th><td>Lay-out worker</td></tr><tr><th>alley</th><td>1697 2496 White Pine Lane Apt. 904</td></tr></table>',
266
+ 0.35,
267
+ "HTML"
268
+ ],
269
+ [
270
+ '<table border=\"1\"><tr><th>imei</th><td>25-894407-891989-9</td></tr><tr><th>post-code</th><td>2142</td></tr><tr><th>startTime</th><td>2001-06-20 10:16:33</td></tr><tr><th>timeGrain</th><td></td></tr><tr><th>longitude</th><td>-70.990988</td></tr><tr><th>latitude</th><td>42.32382</td></tr><tr><th>endTime</th><td>1971-08-20 19:09:13</td></tr><tr><th>api-version</th><td>u zNS zNS</td></tr><tr><th>key store password</th><td>teiy1oD5ie</td></tr><tr><th>bank account</th><td>FILW85959012098599</td></tr></table>',
271
+ 0.35,
272
+ "HTML"
273
+ ],
274
+ [
275
+ '<table border=\"1\"><tr><th>country</th><td>United States</td></tr><tr><th>address</th><td>0133 2669 Locust Street Suite 601 Fort Gaines United States</td></tr><tr><th>project</th><td></td></tr><tr><th>nation_plural</th><td>vietnameses</td></tr><tr><th>urban__area</th><td>Buena Park</td></tr><tr><th>region</th><td>California</td></tr><tr><th>street</th><td>01474 3910 Melody Lane Apt. 383</td></tr><tr><th>phone-country-code</th><td>US</td></tr><tr><th>spot</th><td>Apt. 554</td></tr></table>',
276
+ 0.35,
277
+ "HTML"
278
+ ],
279
+ # JSON samples (longer, more complex)
280
+ [
281
+ '{\"api_key\": \"9ewl5\", \"page\": \"82\", \"max_primary_general_date\": \"1998-02-01\", \"sort\": \"nz siw\", \"election_type_id\": \"guerv jgwbunon guerv\", \"election_district\": \"03vpuute\", \"max_election_date\": \"1980-12-30\", \"sort_null_only\": \"false\", \"min_election_date\": \"2003-03-05\", \"per_page\": \"96\", \"min_primary_general_date\": \"1991-05-29\", \"election_state\": \"f9u4gfgt pzji\", \"election_party\": \"\", \"min_update_date\": \"1998-01-26\", \"sort_nulls_last\": \"false\", \"max_create_date\": \"1970-10-19\", \"office_sought\": \"rz1thr5zp\", \"max_update_date\": \"2018-12-12\", \"sort_hide_null\": \"true\", \"election_year\": \"alrcfqpswf\", \"min_create_date\": \"2003-02-18\"}',
282
+ 0.35,
283
+ "JSON"
284
+ ],
285
+ [
286
+ '{\"sort\": \"\", \"incumbent_challenge\": \"rQ a\", \"longitude\": \"-98.705515\", \"has_raised_funds\": \"True\", \"airport\": \"New Orleans International airport\", \"office\": \"\", \"candidate_status\": \"e\", \"district\": \"\", \"sort_nulls_last\": \"True\", \"per_page\": \"344387016\", \"state\": \"Texas\", \"location\": \"-89.030682\", \"airport_icao\": \"KOKC\", \"api_key\": \"\", \"origin airport code\": \"LIS\", \"year\": \"2012\", \"sort_hide_null\": \"False\", \"cycle\": \"VAnEFSGu LDiJQtw LDiJQtw\", \"lat\": \"33.182925\", \"sort_null_only\": \"False\", \"page\": \"5661254\", \"election_year\": \"\", \"federal_funds_flag\": \"False\", \"party\": \"\", \"name\": \"OSsUo\"}',
287
+ 0.35,
288
+ "JSON"
289
+ ],
290
+ [
291
+ '{\"nationality\": \"American\", \"keyStorePass\": \"LObizj\", \":operation\": \"XSnpUioywM iOF5gN1bHM\", \"currentPassword\": \"wo3vooch8Ie\", \"nation_plural\": \"north-americans\", \"alias\": \"aoJPk aoJPk\", \"prefix\": \"Mr.\", \"prefix_male\": \"Mr.\", \"newAlias\": \"\", \"nation_woman\": \"western samoan\", \"newPassword\": \"UVpvCQ UVpvCQ\", \"keyPassword\": \"k4GWWlP@@z\", \"nation_man\": \"bahraini\", \"rePassword\": \"\", \"removeAlias\": \"o\"}',
292
+ 0.35,
293
+ "JSON"
294
+ ],
295
+ [
296
+ '{\"imei\": \"27-051998-738345-4\", \"post-code\": \"28403\", \"startTime\": \"1996-04-20 02:21:52\", \"timeGrain\": \"0f8Jl9qmZ3 cJSVXOylw\", \"longitude\": \"-77.952502\", \"latitude\": \"34.258789\", \"endTime\": \"1994-08-17 13:38:00\", \"api-version\": \"HDjWC jcOLlPG8W\", \"key store password\": \"ahZeT2ee\", \"bank account\": \"KEKY41344355014443\"}',
297
+ 0.35,
298
+ "JSON"
299
+ ],
300
+ # SQL samples (longer, more complex)
301
+ [
302
+ 'SELECT \"endTime,startTime,age,nation_woman,national identity,arline name,airport_icao,coordinate,api-version\",\"api-version\",CASE WHEN \"endTime\" THEN \'skin\' WHEN \"startTime\"=\'1992-01-13 23:33:10\' THEN \'president\' WHEN \"age\"=\'31\' THEN \'be\' WHEN \"nation_woman\"=\'syrian\' THEN \'particular\' WHEN \"national identity\"<>\'600233955\' THEN \'trip\' WHEN \"arline name\"<>\'Shanghai Airlines\' THEN \'present\' WHEN \"airport_icao\"<>\'SBJP\' THEN \'forget\' WHEN \"coordinate\"=\'52.297060\' THEN \'car\' WHEN \"api-version\" THEN \'also\' END FROM \"not\" WHERE \"endTime\" AND \"startTime\"=\'1973-12-27 11:08:01\' AND (\"age\"=\'64\' OR \"age\"=\'answer\') AND \"nation_woman\"<>\'guyanese\' AND \"national identity\"<>\'142451774\' AND \"arline name\" AND \"airport_icao\" AND \"coordinate\"=\'46.828790\' AND (\"api-version\"=\'KOikhS KOikhS yz\' OR \"api-version\"=\'activity\') LIMIT 64',
303
+ 0.35,
304
+ "SQL"
305
+ ],
306
+ [
307
+ 'SELECT \"week__day,Version,Tags,age,currency_code,TargetBucket,expiration-date,TargetSnapshotName,swift-code,KmsKeyId,Action,debit card,SourceSnapshotName\",\"SourceSnapshotName\",CASE WHEN \"week__day\"=\'Saturday\' THEN \'serious\' WHEN \"Version\"=\'2015-02-02\' OR \"Version\"=\'staff\' THEN \'country\' WHEN \"Tags\"<>\'\' THEN \'water\' WHEN \"age\" THEN \'behind\' WHEN \"currency_code\"=\'CAD\' THEN \'position\' WHEN \"TargetBucket\" THEN \'next\' WHEN \"expiration-date\"=\'11/2023\' OR \"expiration-date\"=\'technology\' THEN \'kid\' WHEN \"TargetSnapshotName\"=\'pWJ\' OR \"TargetSnapshotName\"=\'give\' THEN \'child\' WHEN \"swift-code\"=\'GWIZGBQPBUW\' THEN \'poor\' WHEN \"KmsKeyId\" THEN \'meeting\' WHEN \"Action\"=\'CopySnapshot\' THEN \'collection\' WHEN \"debit card\"<>\'30381983513092\' THEN \'paper\' WHEN \"SourceSnapshotName\"=\'\' THEN \'keep\' END FROM \"statement\" WHERE \"week__day\"=\'Tuesday\' AND \"Version\"=\'2015-02-02\' AND \"Tags\"=\'\' AND \"age\"=\'20\' AND \"currency_code\"=\'MGA\' AND \"TargetBucket\"=\'\' AND \"expiration-date\"=\'02/24\' AND \"TargetSnapshotName\"=\'\' AND \"swift-code\"=\'GNCHGBZC\' AND \"KmsKeyId\"=\'\' AND \"Action\"=\'CopySnapshot\' AND \"debit card\"=\'4534384187682\' AND \"SourceSnapshotName\"=\'\' LIMIT 36',
308
+ 0.35,
309
+ "SQL"
310
+ ],
311
+ [
312
+ 'SELECT \"expiration-date,prettyPrint,alt,master-card,arline__name,key,bank city,fields,building,quotaUser,userIp,to country code,oauth_token\",\"oauth_token\",CASE WHEN \"expiration-date\"=\'3/2024\' THEN \'reduce\' WHEN \"prettyPrint\"=\'False\' OR \"prettyPrint\"=\'south\' THEN \'within\' WHEN \"alt\"<>\'json\' THEN \'thing\' WHEN \"master-card\" THEN \'strategy\' WHEN \"arline__name\"=\'Air India\' THEN \'forward\' WHEN \"key\" THEN \'artist\' WHEN \"bank city\"=\'Helena\' OR \"bank city\"=\'more\' THEN \'pay\' WHEN \"fields\"=\'\' OR \"fields\"=\'thing\' THEN \'rest\' WHEN \"building\"=\'977\' THEN \'executive\' WHEN \"quotaUser\" THEN \'safe\' WHEN \"userIp\"=\'pWJ\' THEN \'whom\' WHEN \"to country code\"<>\'US\' THEN \'not\' WHEN \"oauth_token\"=\'\' THEN \'choice\' END FROM \"wrong\" WHERE (\"expiration-date\"=\'05/23\' OR \"expiration-date\"=\'language\') AND \"prettyPrint\"=\'True\' AND \"alt\"<>\'json\' AND \"master-card\"=\'349245482859346\' AND \"arline__name\"=\'Indonesia AirAsia\' AND \"key\"=\'\' AND \"bank city\"=\'Georgetown\' AND \"fields\"=\'\' AND \"building\"=\'7241\' AND \"quotaUser\"=\'\' AND \"userIp\"=\'\' AND \"to country code\"=\'TM\' AND \"oauth_token\"=\'\' LIMIT 64',
313
+ 0.35,
314
+ "SQL"
315
+ ],
316
+ [
317
+ 'SELECT `schemaName,databaseName,city,building,coordinate,state_abbreviation,driver license,international__mobile__equipment__identity`,`international__mobile__equipment__identity`,CASE WHEN `schemaName`<>\'fX04 bHQKn bHQKn\' THEN \'far\' WHEN `databaseName` THEN \'college\' WHEN `city`=\'Orlando\' OR `city`=\'probably\' THEN \'boy\' WHEN `building`<>\'2672\' THEN \'wind\' WHEN `coordinate`=\'-21.907687\' THEN \'offer\' WHEN `state_abbreviation`=\'FL\' THEN \'its\' WHEN `driver license`=\'H872538367807\' THEN \'lose\' WHEN `international__mobile__equipment__identity`=\'42-161139-363377-6\' OR `international__mobile__equipment__identity`=\'attention\' THEN \'nor\' END FROM `business` WHERE (`schemaName`=\'BfgAeXWjbC BfgAeXWjbC\' OR `schemaName`=\'across\') AND `databaseName`<>\'hw w\' AND `city`=\'West Caroline\' AND `building`<>\'44030\' AND `coordinate`=\'-21.907687\' AND `state_abbreviation`=\'IA\' AND `driver license`=\'224242065\' AND `international__mobile__equipment__identity`=\'83-695777-883364-1\' LIMIT 10',
318
+ 0.35,
319
+ "SQL"
320
+ ],
321
+ # XML samples (longer, more complex)
322
+ [
323
+ 'b\'<?xml version=\"1.0\" encoding=\"UTF-8\" ?><root><sort type=\"str\"></sort><incumbent_challenge type=\"str\"></incumbent_challenge><longitude type=\"str\">-97.518538</longitude><has_raised_funds type=\"str\">True</has_raised_funds><airport type=\"str\">John F Kennedy International airport</airport><office type=\"str\">IDuqbH m</office><candidate_status type=\"str\">qEw3Tpc wmYqRUtTH</candidate_status><district type=\"str\">D UCd6ZAFD D</district><sort_nulls_last type=\"str\">False</sort_nulls_last><per_page type=\"str\">7720</per_page><state type=\"str\">South Dakota</state><location type=\"str\">-109.575655</location><airport_icao type=\"str\">EDDH</airport_icao><api_key type=\"str\">46nCNe0 Wj Wj</api_key><origin_airport_code type=\"str\">DEN</origin_airport_code><year type=\"str\">1996</year><sort_hide_null type=\"str\">False</sort_hide_null><cycle type=\"str\">FNxL</cycle><lat type=\"str\">43.16524</lat><sort_null_only type=\"str\">False</sort_null_only><page type=\"str\">4894426</page><election_year type=\"str\"></election_year><federal_funds_flag type=\"str\">False</federal_funds_flag><party type=\"str\"></party><name type=\"str\">aKPjF</name></root>\'',
324
+ 0.35,
325
+ "XML"
326
+ ],
327
+ [
328
+ 'b\'<?xml version=\"1.0\" encoding=\"UTF-8\" ?><root><api_key type=\"str\">E hMCQl hMCQl</api_key><page type=\"str\">984478</page><max_primary_general_date type=\"str\">2008-01-29</max_primary_general_date><sort type=\"str\"></sort><election_type_id type=\"str\">L85O2N</election_type_id><election_district type=\"str\">M</election_district><max_election_date type=\"str\">2017-08-07</max_election_date><sort_null_only type=\"str\">False</sort_null_only><min_election_date type=\"str\">2007-07-01</min_election_date><per_page type=\"str\">452141118</per_page><min_primary_general_date type=\"str\">1977-07-12</min_primary_general_date><election_state type=\"str\"></election_state><election_party type=\"str\">CH4 Ceq Ceq</election_party><min_update_date type=\"str\">1980-04-11</min_update_date><sort_nulls_last type=\"str\">False</sort_nulls_last><max_create_date type=\"str\">1997-04-23</max_create_date><max_update_date type=\"str\">2020-12-25</max_update_date><sort_hide_null type=\"str\">True</sort_hide_null><election_year type=\"str\">v0rF4t8</election_year><min_create_date type=\"str\">2013-11-30</min_create_date></root>\'',
329
+ 0.35,
330
+ "XML"
331
+ ],
332
+ [
333
+ 'b\'<?xml version=\"1.0\" encoding=\"UTF-8\" ?><root><nationality type=\"str\">American</nationality><last_name_male type=\"str\">Hayden</last_name_male><NextToken type=\"str\">YX8Fh4d NiOugSJPwm NiOugSJPwm</NextToken><StartDate type=\"str\">2007-04-07</StartDate><EndDate type=\"str\">1971-05-28</EndDate><family-name-female type=\"str\">Weishaar</family-name-female><PageSize type=\"str\">19750435</PageSize><prefix_male type=\"str\">Mr.</prefix_male><given__name__female type=\"str\">Dara</given__name__female><nation_man type=\"str\">bulgarian</nation_man></root>\'',
334
+ 0.35,
335
+ "XML"
336
+ ],
337
+ [
338
+ 'b\'<?xml version=\"1.0\" encoding=\"UTF-8\" ?><root><imei type=\"str\">30-696164-389965-5</imei><post-code type=\"str\">33179</post-code><startTime type=\"str\">2017-02-05 13:11:21</startTime><timeGrain type=\"str\">S</timeGrain><longitude type=\"str\">-80.270951</longitude><latitude type=\"str\">25.898545</latitude><endTime type=\"str\">1990-02-04 22:51:09</endTime><api-version type=\"str\">Ad Ad wM5NWqRt</api-version><key_store_password type=\"str\">Shohr3aep</key_store_password><bank_account type=\"str\">BZEV05211288606606</bank_account></root>\'',
339
+ 0.35,
340
+ "XML"
341
+ ],
342
+ ]
343
+
344
# Gradio UI: intro text, reference accordions, the input controls, the
# highlighted output, clickable examples, and the event wiring that re-runs
# `ner` whenever an input changes.
with gr.Blocks(title="Gravitee BERT PII") as demo:
    gr.Markdown(
        f"""
        # Gravitee BERT PII (Personally Identifiable Information extraction)

        This application uses the **{MODEL_NAME}** model for Named Entity Recognition (NER) to detect personally identifiable information.
        The model uses token classification with BIO tagging to identify predefined entity types including names, addresses,
        financial information, and more.
        """
    )

    with gr.Accordion("Available Entity Types", open=False):
        gr.Markdown(
            """
            The BERT models can detect the following entity types:

            **Personal Information:**
            - PERSON (names)
            - AGE
            - PHONE_NUMBER
            - EMAIL_ADDRESS

            **Location & Address:**
            - LOCATION
            - COORDINATE

            **Financial:**
            - CREDIT_CARD
            - IBAN_CODE
            - FINANCIAL
            - US_BANK_NUMBER

            **Government IDs:**
            - US_SSN (Social Security Number)
            - US_DRIVER_LICENSE
            - US_PASSPORT
            - US_ITIN
            - US_LICENSE_PLATE
            - NRP (Nationality, Religious or Political group)

            **Technical:**
            - IP_ADDRESS
            - MAC_ADDRESS
            - URL
            - IMEI
            - PASSWORD

            **Other:**
            - DATE_TIME
            - ORGANIZATION
            - TITLE
            """
        )

    with gr.Accordion("How to run this model locally", open=False):
        # f-string so the snippet always names the model this app actually
        # loads (it previously hard-coded a different model id).
        gr.Markdown(
            f"""
            ## Installation
            To use this model, install the required dependencies:
            ```
            pip install transformers optimum[onnxruntime] torch
            ```

            ## Usage
            Load the model using the Optimum library for ONNX Runtime:
            ```python
            from optimum.onnxruntime import ORTModelForTokenClassification
            from transformers import AutoTokenizer

            model_path = "{MODEL_NAME}"
            tokenizer = AutoTokenizer.from_pretrained(model_path)
            model = ORTModelForTokenClassification.from_pretrained(model_path, file_name="model.onnx")

            text = "John Doe lives at 123 Main St and his email is john@example.com"
            inputs = tokenizer(text, return_tensors="pt", return_offsets_mapping=True)
            outputs = model(**inputs)
            ```
            """
        )

    input_text = gr.Textbox(
        value=examples[0][0],
        label="Text input",
        placeholder="Enter your text here",
    )

    with gr.Row():
        threshold = gr.Slider(
            0,
            1,
            value=0.35,
            step=0.01,
            label="Confidence Threshold",
            info="Lower the threshold to get more predictions with lower confidence.",
            scale=2,
        )

        # Read-only: filled in from the selected example row.
        data_type_display = gr.Textbox(
            value=examples[0][2],
            label="Data Type",
            interactive=False,
            scale=1,
        )

        format_checkbox = gr.Checkbox(
            value=False,
            label="Format Text",
            info="Auto-format JSON, XML, HTML, SQL with proper indentation",
            scale=1,
        )

    output = gr.HighlightedText(label="Predicted Entities")
    submit_btn = gr.Button("Submit")

    # Every trigger feeds the same four components to `ner`.
    ner_inputs = [input_text, threshold, data_type_display, format_checkbox]

    examples_component = gr.Examples(
        examples,
        fn=ner,
        inputs=ner_inputs,
        outputs=output,
        cache_examples=False,
    )

    # Event handlers
    input_text.submit(fn=ner, inputs=ner_inputs, outputs=output)
    threshold.release(fn=ner, inputs=ner_inputs, outputs=output)
    format_checkbox.change(fn=ner, inputs=ner_inputs, outputs=output)
    submit_btn.click(fn=ner, inputs=ner_inputs, outputs=output)
471
+
472
# Script entry point: enable request queuing, then start the server in
# debug mode.
if __name__ == "__main__":
    demo.queue().launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ transformers>=4.21.0
2
+ optimum[onnxruntime]>=1.12.0
3
+ gradio>=4.0.0
4
+ torch>=1.13.0
5
+ numpy>=1.21.0
6
+ beautifulsoup4>=4.9.0