lhoestq HF staff committed on
Commit
880a98d
•
1 Parent(s): bb49074

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -4
app.py CHANGED
@@ -11,6 +11,19 @@ from analyze import analyzer, get_column_description, get_columns_with_strings,
11
 
12
  MAX_ROWS = 100
13
  T = TypeVar("T")
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
16
  batch_size = 100
@@ -52,7 +65,7 @@ def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFram
52
  for presidio_entity in presidio_scan_entities(
53
  rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
54
  ):
55
- if presidio_entity.type in enabled_presidio_entities:
56
  presidio_entities.append(presidio_entity)
57
  yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
58
  yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
@@ -68,8 +81,8 @@ with gr.Blocks() as demo:
68
  ),
69
  gr.CheckBoxGroup(
70
  label="Presidio entities",
71
- choices=analyzer.get_supported_entities(),
72
- value=["PERSON", "CREDIT_CARD", "US_SSN", "PHONE_NUMBER", "EMAIL_ADDRESS", "IP_ADDRESS", "US_BANK_NUMBER", "EMAIL", "IBAN_CODE"],
73
  interative=True,
74
  ),
75
  ]
@@ -80,7 +93,14 @@ with gr.Blocks() as demo:
80
  ]
81
  button.click(analyze_dataset, inputs, outputs)
82
  gr.Examples(
83
- [["microsoft/orca-math-word-problems-200k"], ["tatsu-lab/alpaca"], ["Anthropic/hh-rlhf"], ["OpenAssistant/oasst1"], ["sidhq/email-thread-summary"], ["lhoestq/fake_name_and_ssn"]],
 
 
 
 
 
 
 
84
  inputs,
85
  outputs,
86
  fn=analyze_dataset,
 
11
 
12
  MAX_ROWS = 100
13
  T = TypeVar("T")
14
+ DEFAULT_PRESIDIO_ENTITIES = sorted([
15
+ 'PERSON',
16
+ 'CREDIT_CARD',
17
+ 'US_SSN',
18
+ 'US_DRIVER_LICENSE',
19
+ 'PHONE_NUMBER',
20
+ 'US_PASSPORT',
21
+ 'EMAIL_ADDRESS',
22
+ 'IP_ADDRESS',
23
+ 'US_BANK_NUMBER',
24
+ 'IBAN_CODE',
25
+ 'EMAIL',
26
+ ])
27
 
28
  def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
29
  batch_size = 100
 
65
  for presidio_entity in presidio_scan_entities(
66
  rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
67
  ):
68
+ if presidio_entity["type"] in enabled_presidio_entities:
69
  presidio_entities.append(presidio_entity)
70
  yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
71
  yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
 
81
  ),
82
  gr.CheckBoxGroup(
83
  label="Presidio entities",
84
+ choices=sorted(analyzer.get_supported_entities()),
85
+ value=DEFAULT_PRESIDIO_ENTITIES,
86
  interative=True,
87
  ),
88
  ]
 
93
  ]
94
  button.click(analyze_dataset, inputs, outputs)
95
  gr.Examples(
96
+ [
97
+ ["microsoft/orca-math-word-problems-200k", DEFAULT_PRESIDIO_ENTITIES],
98
+ ["tatsu-lab/alpaca", DEFAULT_PRESIDIO_ENTITIES],
99
+ ["Anthropic/hh-rlhf", DEFAULT_PRESIDIO_ENTITIES],
100
+ ["OpenAssistant/oasst1", DEFAULT_PRESIDIO_ENTITIES],
101
+ ["sidhq/email-thread-summary", DEFAULT_PRESIDIO_ENTITIES],
102
+ ["lhoestq/fake_name_and_ssn", DEFAULT_PRESIDIO_ENTITIES]
103
+ ],
104
  inputs,
105
  outputs,
106
  fn=analyze_dataset,