MikeG27 committed on
Commit
0f799c0
·
verified ·
1 Parent(s): 4936ece

Update app.py

Browse files

Update examples order

Files changed (1) hide show
  1. app.py +46 -47
app.py CHANGED
@@ -202,7 +202,7 @@ def format_text(text: str, format_type: str) -> str:
202
  def ner(text: str, threshold: float, data_type: str = None, format_input: bool = False) -> List[Tuple[str, str]]:
203
  """Main NER function for Gradio interface"""
204
  # Format text if requested
205
- if format_input and data_type and data_type != "Plain Text":
206
  formatted_text = format_text(text, data_type)
207
  result = predict_entities(formatted_text, threshold)
208
  display_text = formatted_text
@@ -232,51 +232,8 @@ def ner(text: str, threshold: float, data_type: str = None, format_input: bool =
232
 
233
  return highlighted
234
 
235
- # Example texts - longer, more complex samples starting with Mixed PII
236
  examples = [
237
- # Natural Text examples (longer, more comprehensive)
238
- [
239
- "Dr. Sarah Martinez, age 34, works as a Senior Data Scientist at TechCorp International. Her employee ID is TC-DS-5591 and she joined the company on 2019-03-15. Sarah lives at 1247 Oak Avenue, Apartment 5B, Portland, Oregon 97205. Her work phone is 503-555-0147 and personal email is sarah.martinez@personalmail.com. For banking, she uses account TCBK89012345678901 at First National Bank. Her driver's license number is OR-DL-M8829134 and her social security number is 123-45-6789. She recently traveled to London using passport US-P-543216789 and her frequent flyer number with Delta Airlines is DL987654321.",
240
- 0.35,
241
- "Plain Text"
242
- ],
243
- [
244
- "The customer database contains the following entries: Michael Chen (DOB: 1985-07-22, age 38) residing at 789 Pine Street, Suite 200, San Francisco, CA 94102. His contact details include phone 415-555-0298 and email michael.chen@businessmail.org. Financial information: Chase Bank account CH-5567889012345678, credit card 4532-1234-5678-9012 (exp: 08/2027, CVV: 451). Professional details: Software Engineer at InnovateTech LLC, employee ID IT-SE-7793, salary $125,000. Government IDs include SSN 987-65-4321, California driver's license CA-DL-B1234567, and passport number US-578912345. His device MAC address is aa:bb:cc:dd:ee:ff and IMEI 358240051111110.",
245
- 0.35,
246
- "Plain Text"
247
- ],
248
- [
249
- "Security incident report for Lisa Thompson (ID: LT-2023-001): On 2023-11-15 at 14:30 PST, user accessed system from IP address 192.168.1.100 using API key api_key_abc123xyz789. Employee details: Lisa Thompson, age 29, title Senior Security Analyst, department Cybersecurity, hired 2021-09-01. Home address: 456 Maple Drive, Unit 3C, Seattle, WA 98109. Contact: phone 206-555-0189, work email lisa.thompson@company.com. Banking: Wells Fargo account WF-4455667788990011, routing number 021000021. Government IDs: SSN 555-44-3333, WA driver's license WA-DL-THOMP567, passport US-890123456. Vehicle: 2020 Honda Civic, license plate WA-ABC1234, VIN 1HGBH41JXMN109186.",
250
- 0.35,
251
- "Plain Text"
252
- ],
253
- [
254
- "Patient intake form: Dr. Robert Kim (Medical License: MD-12345-WA), age 42, practices at Seattle General Hospital, 1500 Medical Center Drive, Seattle, WA 98101. Phone: 206-555-0234, fax: 206-555-0235, email: dr.kim@seattlegeneral.org. Patient information: Jennifer Walsh, DOB 1990-12-03 (age 33), SSN 111-22-3333, address 2100 Broadway Ave, Apt 15D, Seattle, WA 98122. Insurance: Blue Cross Blue Shield, policy BC-556677889900, group 12345. Emergency contact: Mark Walsh (spouse), phone 206-555-0167. Medical history includes prescription for Medication XYZ, DEA number DR1234567. Appointment scheduled for 2024-01-20 at 10:00 AM, confirmation code CONF-789456.",
255
- 0.35,
256
- "Plain Text"
257
- ],
258
- # HTML samples (longer, more complex)
259
- [
260
- '<table border=\"1\"><tr><th>api_key</th><td>PmtrSlgEzO PmtrSlgEzO br</td></tr><tr><th>page</th><td>73595</td></tr><tr><th>max_primary_general_date</th><td>1992-09-22</td></tr><tr><th>sort</th><td>RqJu PZwhjrbcS</td></tr><tr><th>election_type_id</th><td>PFTZDOBxIl</td></tr><tr><th>election_district</th><td>XNc7rk</td></tr><tr><th>max_election_date</th><td>2007-02-15</td></tr><tr><th>sort_null_only</th><td>False</td></tr><tr><th>min_election_date</th><td>2014-06-27</td></tr><tr><th>per_page</th><td>62971536</td></tr><tr><th>min_primary_general_date</th><td>1982-03-22</td></tr><tr><th>election_state</th><td>xzJis</td></tr><tr><th>election_party</th><td>lHUet 1vtAg5J lHUet</td></tr><tr><th>min_update_date</th><td>1984-07-25</td></tr><tr><th>sort_nulls_last</th><td>False</td></tr><tr><th>max_create_date</th><td>1980-01-02</td></tr><tr><th>max_update_date</th><td>1997-11-10</td></tr><tr><th>sort_hide_null</th><td>True</td></tr><tr><th>election_year</th><td>hNf2nYGMbX</td></tr><tr><th>min_create_date</th><td>2000-11-25</td></tr></table>',
261
- 0.35,
262
- "HTML"
263
- ],
264
- [
265
- '<table border=\"1\"><tr><th>religion</th><td>Christianity</td></tr><tr><th>api-version</th><td>dCwMNqR</td></tr><tr><th>to_contact</th><td>VirginiaTBarrett@fleckens.hu</td></tr><tr><th>spot</th><td>6765 2278 Norma Avenue Mcbee , SC 33987</td></tr><tr><th>endTime</th><td>2022-09-07 14:17:30</td></tr><tr><th>startTime</th><td>2001-09-20 20:45:43</td></tr><tr><th>facility</th><td>Apt. 074</td></tr><tr><th>vocation</th><td>Lay-out worker</td></tr><tr><th>alley</th><td>1697 2496 White Pine Lane Apt. 904</td></tr></table>',
266
- 0.35,
267
- "HTML"
268
- ],
269
- [
270
- '<table border=\"1\"><tr><th>imei</th><td>25-894407-891989-9</td></tr><tr><th>post-code</th><td>2142</td></tr><tr><th>startTime</th><td>2001-06-20 10:16:33</td></tr><tr><th>timeGrain</th><td></td></tr><tr><th>longitude</th><td>-70.990988</td></tr><tr><th>latitude</th><td>42.32382</td></tr><tr><th>endTime</th><td>1971-08-20 19:09:13</td></tr><tr><th>api-version</th><td>u zNS zNS</td></tr><tr><th>key store password</th><td>teiy1oD5ie</td></tr><tr><th>bank account</th><td>FILW85959012098599</td></tr></table>',
271
- 0.35,
272
- "HTML"
273
- ],
274
- [
275
- '<table border=\"1\"><tr><th>country</th><td>United States</td></tr><tr><th>address</th><td>0133 2669 Locust Street Suite 601 Fort Gaines United States</td></tr><tr><th>project</th><td></td></tr><tr><th>nation_plural</th><td>vietnameses</td></tr><tr><th>urban__area</th><td>Buena Park</td></tr><tr><th>region</th><td>California</td></tr><tr><th>street</th><td>01474 3910 Melody Lane Apt. 383</td></tr><tr><th>phone-country-code</th><td>US</td></tr><tr><th>spot</th><td>Apt. 554</td></tr></table>',
276
- 0.35,
277
- "HTML"
278
- ],
279
- # JSON samples (longer, more complex)
280
  [
281
  '{\"api_key\": \"9ewl5\", \"page\": \"82\", \"max_primary_general_date\": \"1998-02-01\", \"sort\": \"nz siw\", \"election_type_id\": \"guerv jgwbunon guerv\", \"election_district\": \"03vpuute\", \"max_election_date\": \"1980-12-30\", \"sort_null_only\": \"false\", \"min_election_date\": \"2003-03-05\", \"per_page\": \"96\", \"min_primary_general_date\": \"1991-05-29\", \"election_state\": \"f9u4gfgt pzji\", \"election_party\": \"\", \"min_update_date\": \"1998-01-26\", \"sort_nulls_last\": \"false\", \"max_create_date\": \"1970-10-19\", \"office_sought\": \"rz1thr5zp\", \"max_update_date\": \"2018-12-12\", \"sort_hide_null\": \"true\", \"election_year\": \"alrcfqpswf\", \"min_create_date\": \"2003-02-18\"}',
282
  0.35,
@@ -297,7 +254,7 @@ examples = [
297
  0.35,
298
  "JSON"
299
  ],
300
- # SQL samples (longer, more complex)
301
  [
302
  'SELECT \"endTime,startTime,age,nation_woman,national identity,arline name,airport_icao,coordinate,api-version\",\"api-version\",CASE WHEN \"endTime\" THEN \'skin\' WHEN \"startTime\"=\'1992-01-13 23:33:10\' THEN \'president\' WHEN \"age\"=\'31\' THEN \'be\' WHEN \"nation_woman\"=\'syrian\' THEN \'particular\' WHEN \"national identity\"<>\'600233955\' THEN \'trip\' WHEN \"arline name\"<>\'Shanghai Airlines\' THEN \'present\' WHEN \"airport_icao\"<>\'SBJP\' THEN \'forget\' WHEN \"coordinate\"=\'52.297060\' THEN \'car\' WHEN \"api-version\" THEN \'also\' END FROM \"not\" WHERE \"endTime\" AND \"startTime\"=\'1973-12-27 11:08:01\' AND (\"age\"=\'64\' OR \"age\"=\'answer\') AND \"nation_woman\"<>\'guyanese\' AND \"national identity\"<>\'142451774\' AND \"arline name\" AND \"airport_icao\" AND \"coordinate\"=\'46.828790\' AND (\"api-version\"=\'KOikhS KOikhS yz\' OR \"api-version\"=\'activity\') LIMIT 64',
303
  0.35,
@@ -318,7 +275,7 @@ examples = [
318
  0.35,
319
  "SQL"
320
  ],
321
- # XML samples (longer, more complex)
322
  [
323
  'b\'<?xml version=\"1.0\" encoding=\"UTF-8\" ?><root><sort type=\"str\"></sort><incumbent_challenge type=\"str\"></incumbent_challenge><longitude type=\"str\">-97.518538</longitude><has_raised_funds type=\"str\">True</has_raised_funds><airport type=\"str\">John F Kennedy International airport</airport><office type=\"str\">IDuqbH m</office><candidate_status type=\"str\">qEw3Tpc wmYqRUtTH</candidate_status><district type=\"str\">D UCd6ZAFD D</district><sort_nulls_last type=\"str\">False</sort_nulls_last><per_page type=\"str\">7720</per_page><state type=\"str\">South Dakota</state><location type=\"str\">-109.575655</location><airport_icao type=\"str\">EDDH</airport_icao><api_key type=\"str\">46nCNe0 Wj Wj</api_key><origin_airport_code type=\"str\">DEN</origin_airport_code><year type=\"str\">1996</year><sort_hide_null type=\"str\">False</sort_hide_null><cycle type=\"str\">FNxL</cycle><lat type=\"str\">43.16524</lat><sort_null_only type=\"str\">False</sort_null_only><page type=\"str\">4894426</page><election_year type=\"str\"></election_year><federal_funds_flag type=\"str\">False</federal_funds_flag><party type=\"str\"></party><name type=\"str\">aKPjF</name></root>\'',
324
  0.35,
@@ -339,6 +296,48 @@ examples = [
339
  0.35,
340
  "XML"
341
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  ]
343
 
344
  with gr.Blocks(title="Gravitee BERT PII") as demo:
 
202
  def ner(text: str, threshold: float, data_type: str = None, format_input: bool = False) -> List[Tuple[str, str]]:
203
  """Main NER function for Gradio interface"""
204
  # Format text if requested
205
+ if format_input and data_type and data_type != "Documents":
206
  formatted_text = format_text(text, data_type)
207
  result = predict_entities(formatted_text, threshold)
208
  display_text = formatted_text
 
232
 
233
  return highlighted
234
 
 
235
  examples = [
236
+ # JSON samples
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  [
238
  '{\"api_key\": \"9ewl5\", \"page\": \"82\", \"max_primary_general_date\": \"1998-02-01\", \"sort\": \"nz siw\", \"election_type_id\": \"guerv jgwbunon guerv\", \"election_district\": \"03vpuute\", \"max_election_date\": \"1980-12-30\", \"sort_null_only\": \"false\", \"min_election_date\": \"2003-03-05\", \"per_page\": \"96\", \"min_primary_general_date\": \"1991-05-29\", \"election_state\": \"f9u4gfgt pzji\", \"election_party\": \"\", \"min_update_date\": \"1998-01-26\", \"sort_nulls_last\": \"false\", \"max_create_date\": \"1970-10-19\", \"office_sought\": \"rz1thr5zp\", \"max_update_date\": \"2018-12-12\", \"sort_hide_null\": \"true\", \"election_year\": \"alrcfqpswf\", \"min_create_date\": \"2003-02-18\"}',
239
  0.35,
 
254
  0.35,
255
  "JSON"
256
  ],
257
+ # SQL samples
258
  [
259
  'SELECT \"endTime,startTime,age,nation_woman,national identity,arline name,airport_icao,coordinate,api-version\",\"api-version\",CASE WHEN \"endTime\" THEN \'skin\' WHEN \"startTime\"=\'1992-01-13 23:33:10\' THEN \'president\' WHEN \"age\"=\'31\' THEN \'be\' WHEN \"nation_woman\"=\'syrian\' THEN \'particular\' WHEN \"national identity\"<>\'600233955\' THEN \'trip\' WHEN \"arline name\"<>\'Shanghai Airlines\' THEN \'present\' WHEN \"airport_icao\"<>\'SBJP\' THEN \'forget\' WHEN \"coordinate\"=\'52.297060\' THEN \'car\' WHEN \"api-version\" THEN \'also\' END FROM \"not\" WHERE \"endTime\" AND \"startTime\"=\'1973-12-27 11:08:01\' AND (\"age\"=\'64\' OR \"age\"=\'answer\') AND \"nation_woman\"<>\'guyanese\' AND \"national identity\"<>\'142451774\' AND \"arline name\" AND \"airport_icao\" AND \"coordinate\"=\'46.828790\' AND (\"api-version\"=\'KOikhS KOikhS yz\' OR \"api-version\"=\'activity\') LIMIT 64',
260
  0.35,
 
275
  0.35,
276
  "SQL"
277
  ],
278
+ # XML samples
279
  [
280
  'b\'<?xml version=\"1.0\" encoding=\"UTF-8\" ?><root><sort type=\"str\"></sort><incumbent_challenge type=\"str\"></incumbent_challenge><longitude type=\"str\">-97.518538</longitude><has_raised_funds type=\"str\">True</has_raised_funds><airport type=\"str\">John F Kennedy International airport</airport><office type=\"str\">IDuqbH m</office><candidate_status type=\"str\">qEw3Tpc wmYqRUtTH</candidate_status><district type=\"str\">D UCd6ZAFD D</district><sort_nulls_last type=\"str\">False</sort_nulls_last><per_page type=\"str\">7720</per_page><state type=\"str\">South Dakota</state><location type=\"str\">-109.575655</location><airport_icao type=\"str\">EDDH</airport_icao><api_key type=\"str\">46nCNe0 Wj Wj</api_key><origin_airport_code type=\"str\">DEN</origin_airport_code><year type=\"str\">1996</year><sort_hide_null type=\"str\">False</sort_hide_null><cycle type=\"str\">FNxL</cycle><lat type=\"str\">43.16524</lat><sort_null_only type=\"str\">False</sort_null_only><page type=\"str\">4894426</page><election_year type=\"str\"></election_year><federal_funds_flag type=\"str\">False</federal_funds_flag><party type=\"str\"></party><name type=\"str\">aKPjF</name></root>\'',
281
  0.35,
 
296
  0.35,
297
  "XML"
298
  ],
299
+ # HTML samples
300
+ [
301
+ '<table border=\"1\"><tr><th>api_key</th><td>PmtrSlgEzO PmtrSlgEzO br</td></tr><tr><th>page</th><td>73595</td></tr><tr><th>max_primary_general_date</th><td>1992-09-22</td></tr><tr><th>sort</th><td>RqJu PZwhjrbcS</td></tr><tr><th>election_type_id</th><td>PFTZDOBxIl</td></tr><tr><th>election_district</th><td>XNc7rk</td></tr><tr><th>max_election_date</th><td>2007-02-15</td></tr><tr><th>sort_null_only</th><td>False</td></tr><tr><th>min_election_date</th><td>2014-06-27</td></tr><tr><th>per_page</th><td>62971536</td></tr><tr><th>min_primary_general_date</th><td>1982-03-22</td></tr><tr><th>election_state</th><td>xzJis</td></tr><tr><th>election_party</th><td>lHUet 1vtAg5J lHUet</td></tr><tr><th>min_update_date</th><td>1984-07-25</td></tr><tr><th>sort_nulls_last</th><td>False</td></tr><tr><th>max_create_date</th><td>1980-01-02</td></tr><tr><th>max_update_date</th><td>1997-11-10</td></tr><tr><th>sort_hide_null</th><td>True</td></tr><tr><th>election_year</th><td>hNf2nYGMbX</td></tr><tr><th>min_create_date</th><td>2000-11-25</td></tr></table>',
302
+ 0.35,
303
+ "HTML"
304
+ ],
305
+ [
306
+ '<table border=\"1\"><tr><th>religion</th><td>Christianity</td></tr><tr><th>api-version</th><td>dCwMNqR</td></tr><tr><th>to_contact</th><td>VirginiaTBarrett@fleckens.hu</td></tr><tr><th>spot</th><td>6765 2278 Norma Avenue Mcbee , SC 33987</td></tr><tr><th>endTime</th><td>2022-09-07 14:17:30</td></tr><tr><th>startTime</th><td>2001-09-20 20:45:43</td></tr><tr><th>facility</th><td>Apt. 074</td></tr><tr><th>vocation</th><td>Lay-out worker</td></tr><tr><th>alley</th><td>1697 2496 White Pine Lane Apt. 904</td></tr></table>',
307
+ 0.35,
308
+ "HTML"
309
+ ],
310
+ [
311
+ '<table border=\"1\"><tr><th>imei</th><td>25-894407-891989-9</td></tr><tr><th>post-code</th><td>2142</td></tr><tr><th>startTime</th><td>2001-06-20 10:16:33</td></tr><tr><th>timeGrain</th><td></td></tr><tr><th>longitude</th><td>-70.990988</td></tr><tr><th>latitude</th><td>42.32382</td></tr><tr><th>endTime</th><td>1971-08-20 19:09:13</td></tr><tr><th>api-version</th><td>u zNS zNS</td></tr><tr><th>key store password</th><td>teiy1oD5ie</td></tr><tr><th>bank account</th><td>FILW85959012098599</td></tr></table>',
312
+ 0.35,
313
+ "HTML"
314
+ ],
315
+ [
316
+ '<table border=\"1\"><tr><th>country</th><td>United States</td></tr><tr><th>address</th><td>0133 2669 Locust Street Suite 601 Fort Gaines United States</td></tr><tr><th>project</th><td></td></tr><tr><th>nation_plural</th><td>vietnameses</td></tr><tr><th>urban__area</th><td>Buena Park</td></tr><tr><th>region</th><td>California</td></tr><tr><th>street</th><td>01474 3910 Melody Lane Apt. 383</td></tr><tr><th>phone-country-code</th><td>US</td></tr><tr><th>spot</th><td>Apt. 554</td></tr></table>',
317
+ 0.35,
318
+ "HTML"
319
+ ],
320
+ # Natural Text examples
321
+ [
322
+ "Dr. Sarah Martinez, age 34, works as a Senior Data Scientist at TechCorp International. Her employee ID is TC-DS-5591 and she joined the company on 2019-03-15. Sarah lives at 1247 Oak Avenue, Apartment 5B, Portland, Oregon 97205. Her work phone is 503-555-0147 and personal email is sarah.martinez@personalmail.com. For banking, she uses account TCBK89012345678901 at First National Bank. Her driver's license number is OR-DL-M8829134 and her social security number is 123-45-6789. She recently traveled to London using passport US-P-543216789 and her frequent flyer number with Delta Airlines is DL987654321.",
323
+ 0.35,
324
+ "Documents"
325
+ ],
326
+ [
327
+ "The customer database contains the following entries: Michael Chen (DOB: 1985-07-22, age 38) residing at 789 Pine Street, Suite 200, San Francisco, CA 94102. His contact details include phone 415-555-0298 and email michael.chen@businessmail.org. Financial information: Chase Bank account CH-5567889012345678, credit card 4532-1234-5678-9012 (exp: 08/2027, CVV: 451). Professional details: Software Engineer at InnovateTech LLC, employee ID IT-SE-7793, salary $125,000. Government IDs include SSN 987-65-4321, California driver's license CA-DL-B1234567, and passport number US-578912345. His device MAC address is aa:bb:cc:dd:ee:ff and IMEI 358240051111110.",
328
+ 0.35,
329
+ "Documents"
330
+ ],
331
+ [
332
+ "Security incident report for Lisa Thompson (ID: LT-2023-001): On 2023-11-15 at 14:30 PST, user accessed system from IP address 192.168.1.100 using API key api_key_abc123xyz789. Employee details: Lisa Thompson, age 29, title Senior Security Analyst, department Cybersecurity, hired 2021-09-01. Home address: 456 Maple Drive, Unit 3C, Seattle, WA 98109. Contact: phone 206-555-0189, work email lisa.thompson@company.com. Banking: Wells Fargo account WF-4455667788990011, routing number 021000021. Government IDs: SSN 555-44-3333, WA driver's license WA-DL-THOMP567, passport US-890123456. Vehicle: 2020 Honda Civic, license plate WA-ABC1234, VIN 1HGBH41JXMN109186.",
333
+ 0.35,
334
+ "Documents"
335
+ ],
336
+ [
337
+ "Patient intake form: Dr. Robert Kim (Medical License: MD-12345-WA), age 42, practices at Seattle General Hospital, 1500 Medical Center Drive, Seattle, WA 98101. Phone: 206-555-0234, fax: 206-555-0235, email: dr.kim@seattlegeneral.org. Patient information: Jennifer Walsh, DOB 1990-12-03 (age 33), SSN 111-22-3333, address 2100 Broadway Ave, Apt 15D, Seattle, WA 98122. Insurance: Blue Cross Blue Shield, policy BC-556677889900, group 12345. Emergency contact: Mark Walsh (spouse), phone 206-555-0167. Medical history includes prescription for Medication XYZ, DEA number DR1234567. Appointment scheduled for 2024-01-20 at 10:00 AM, confirmation code CONF-789456.",
338
+ 0.35,
339
+ "Documents"
340
+ ],
341
  ]
342
 
343
  with gr.Blocks(title="Gravitee BERT PII") as demo: