lhoestq (HF staff) committed d0ca54b (1 parent: 797cf6b)

Files changed (2):
  1. analyze.py (+5 -6)
  2. app.py (+44 -13)
analyze.py CHANGED

@@ -46,11 +46,10 @@ def batched(
 
 
 def mask(text: str) -> str:
-    return text  # don't apply mask for demo
-    # return " ".join(
-    #     word[: min(2, len(word) - 1)] + re.sub("[A-Za-z0-9]", "*", word[min(2, len(word) - 1) :])
-    #     for word in text.split(" ")
-    # )
+    return " ".join(
+        word[: min(2, len(word) - 1)] + re.sub("[A-Za-z0-9]", "*", word[min(2, len(word) - 1) :])
+        for word in text.split(" ")
+    )
 
 
 def get_strings(row_content: Any) -> str:
@@ -101,7 +100,7 @@ def analyze(
     ]
     return [
         PresidioEntity(
-            text=mask(texts[i * len(scanned_columns) + j][recognizer_result.start : recognizer_result.end]),
+            text=texts[i * len(scanned_columns) + j][recognizer_result.start : recognizer_result.end],
             type=recognizer_result.entity_type,
             row_idx=row_idx,
            column_name=column_name,
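
For reference, a minimal standalone sketch (not part of the commit) of what the re-enabled mask() produces: it keeps at most the first two characters of each whitespace-separated word and replaces the remaining letters and digits with "*". The sample inputs are made up for illustration.

    import re

    def mask(text: str) -> str:  # same logic as the version enabled in analyze.py above
        return " ".join(
            word[: min(2, len(word) - 1)] + re.sub("[A-Za-z0-9]", "*", word[min(2, len(word) - 1) :])
            for word in text.split(" ")
        )

    print(mask("john.doe@example.com"))  # -> jo**.***@*******.***
    print(mask("4242 4242 4242 4242"))   # -> 42** 42** 42** 42**

With this commit, analyze() no longer wraps the matched text in mask(); instead app.py imports mask() and applies it at display time unless the new "Show texts without masks" checkbox (see the app.py diff below) is enabled.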
app.py CHANGED

@@ -1,4 +1,6 @@
-from itertools import count, islice
+from collections import Counter
+from itertools import count, groupby, islice
+from operator import itemgetter
 from typing import Any, Iterable, TypeVar
 
 import gradio as gr
@@ -7,7 +9,7 @@ import pandas as pd
 from datasets import Features
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 
-from analyze import analyzer, get_column_description, get_columns_with_strings, presidio_scan_entities
+from analyze import PresidioEntity, analyzer, get_column_description, get_columns_with_strings, mask, presidio_scan_entities
 
 MAX_ROWS = 100
 T = TypeVar("T")
@@ -24,6 +26,22 @@ DEFAULT_PRESIDIO_ENTITIES = sorted([
     'IBAN_CODE',
     'EMAIL',
 ])
+WARNING_PRESIDIO_ENTITIES = sorted([
+    'PHONE_NUMBER',
+    'US_PASSPORT',
+    'EMAIL_ADDRESS',
+    'IP_ADDRESS',
+    'US_BANK_NUMBER',
+    'IBAN_CODE',
+    'EMAIL',
+])
+ALERT_PRESIDIO_ENTITIES = sorted([
+    'CREDIT_CARD',
+    'US_SSN',
+    'US_PASSPORT',
+    'US_BANK_NUMBER',
+    'IBAN_CODE',
+])
 
 def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
     batch_size = 100
@@ -47,7 +65,16 @@ class track_iter:
             self.next_idx += 1
             yield item
 
-def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFrame:
+
+def presidio_report(presidio_entities: list[PresidioEntity], next_row_idx: int, num_rows: int) -> dict[str, float]:
+    title = f"Scan finished: {len(presidio_entities)} entities found" if num_rows == next_row_idx else "Scan in progress..."
+    counter = Counter([title] * next_row_idx)
+    for row_idx, presidio_entities_per_row in groupby(presidio_entities, itemgetter("row_idx")):
+        counter.update(set("% of rows with " + presidio_entity["type"] for presidio_entity in presidio_entities_per_row))
+    return dict((presidio_entity_type, presidio_entity_type_row_count / num_rows) for presidio_entity_type, presidio_entity_type_row_count in counter.most_common())
+
+
+def analyze_dataset(dataset: str, enabled_presidio_entities: list[str] = DEFAULT_PRESIDIO_ENTITIES, show_texts_without_masks: bool = False) -> pd.DataFrame:
     info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
     if "error" in info_resp:
         yield "❌ " + info_resp["error"], pd.DataFrame()
@@ -65,10 +92,12 @@ def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFrame:
     for presidio_entity in presidio_scan_entities(
         rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
     ):
+        if not show_texts_without_masks:
+            presidio_entity["text"] = mask(presidio_entity["text"])
         if presidio_entity["type"] in enabled_presidio_entities:
            presidio_entities.append(presidio_entity)
-        yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
-    yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
+        yield presidio_report(presidio_entities, next_row_idx=rows.next_idx, num_rows=num_rows), pd.DataFrame(presidio_entities)
+    yield presidio_report(presidio_entities, next_row_idx=rows.next_idx, num_rows=num_rows), pd.DataFrame(presidio_entities)
 
 with gr.Blocks() as demo:
     gr.Markdown("# Scan datasets using Presidio")
@@ -85,26 +114,28 @@ with gr.Blocks() as demo:
             value=DEFAULT_PRESIDIO_ENTITIES,
             interactive=True,
         ),
+        gr.Checkbox(label="Show texts without masks", value=False),
     ]
     button = gr.Button("Run Presidio Scan")
     outputs = [
-        gr.Markdown(),
+        gr.Label(show_label=False),
         gr.DataFrame(),
     ]
     button.click(analyze_dataset, inputs, outputs)
     gr.Examples(
         [
-            ["microsoft/orca-math-word-problems-200k", DEFAULT_PRESIDIO_ENTITIES],
-            ["tatsu-lab/alpaca", DEFAULT_PRESIDIO_ENTITIES],
-            ["Anthropic/hh-rlhf", DEFAULT_PRESIDIO_ENTITIES],
-            ["OpenAssistant/oasst1", DEFAULT_PRESIDIO_ENTITIES],
-            ["sidhq/email-thread-summary", DEFAULT_PRESIDIO_ENTITIES],
-            ["lhoestq/fake_name_and_ssn", DEFAULT_PRESIDIO_ENTITIES]
+            ["microsoft/orca-math-word-problems-200k"],
+            ["tatsu-lab/alpaca"],
+            ["Anthropic/hh-rlhf"],
+            ["OpenAssistant/oasst1"],
+            ["sidhq/email-thread-summary"],
+            ["lhoestq/fake_name_and_ssn"]
         ],
         inputs,
         outputs,
         fn=analyze_dataset,
-        run_on_click=True
+        run_on_click=True,
+        cache_examples=False,
     )
 
 demo.launch()
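
For context, a minimal sketch (not part of the commit) of what the new presidio_report() yields to the gr.Label output: a {label: score} dict whose first entry tracks scan progress and whose other entries give the fraction of scanned rows containing each entity type. It assumes, as the diff suggests, that PresidioEntity behaves like a dict with "type" and "row_idx" keys; the sample entities are made up.

    from collections import Counter
    from itertools import groupby
    from operator import itemgetter

    def presidio_report(presidio_entities, next_row_idx, num_rows):
        # mirrors the function added in app.py, minus the type hints
        title = f"Scan finished: {len(presidio_entities)} entities found" if num_rows == next_row_idx else "Scan in progress..."
        counter = Counter([title] * next_row_idx)  # the title's score doubles as the progress fraction
        for _row_idx, per_row in groupby(presidio_entities, itemgetter("row_idx")):
            counter.update(set("% of rows with " + entity["type"] for entity in per_row))  # count each type once per row
        return {key: row_count / num_rows for key, row_count in counter.most_common()}

    entities = [
        {"type": "EMAIL_ADDRESS", "row_idx": 0},
        {"type": "EMAIL_ADDRESS", "row_idx": 0},  # a duplicate type in the same row counts once
        {"type": "US_SSN", "row_idx": 3},
    ]
    print(presidio_report(entities, next_row_idx=100, num_rows=100))
    # {'Scan finished: 3 entities found': 1.0, '% of rows with EMAIL_ADDRESS': 0.01, '% of rows with US_SSN': 0.01}

gr.Label renders such a dict as a ranked list of labels with score bars, which is why the commit swaps the gr.Markdown status output for gr.Label(show_label=False).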