lhoestq HF staff committed on
Commit
05daa8e
1 Parent(s): 47aa39b

missing args + set limits

Browse files
Files changed (1) hide show
  1. app.py +8 -5
app.py CHANGED
@@ -1,4 +1,4 @@
1
- from itertools import count
2
  from typing import Any, Iterable
3
 
4
  import gradio as gr
@@ -9,7 +9,10 @@ from gradio_huggingfacehub_search import HuggingfaceHubSearch
9
 
10
  from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
11
 
12
- def stream_rows() -> Iterable[dict[str, Any]]:
 
 
 
13
  batch_size = 100
14
  for i in count():
15
  rows_resp = requests.get(f"https://datasets-server.huggingface.co/rows?dataset={dataset}&config={config}&split={split}&offset={i * batch_size}&length={batch_size}", timeout=20).json()
@@ -32,11 +35,11 @@ def analyze_dataset(dataset: str) -> pd.DataFrame:
32
  columns_descriptions = [
33
  get_column_description(column_name, features[column_name]) for column_name in scanned_columns
34
  ]
35
- rows = stream_rows(dataset, config, split)
36
  presidio_entities = []
37
- for presidio_entity in presidio_scan_entities(
38
  rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
39
- ):
40
  presidio_entities.append(presidio_entity)
41
  yield f"Presidio scan results for {dataset}:", pd.DataFrame(presidio_entities)
42
 
 
1
+ from itertools import count, islice
2
  from typing import Any, Iterable
3
 
4
  import gradio as gr
 
9
 
10
  from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
11
 
12
+ MAX_ENTITIES = 100
13
+ MAX_ROWS = 100
14
+
15
+ def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
16
  batch_size = 100
17
  for i in count():
18
  rows_resp = requests.get(f"https://datasets-server.huggingface.co/rows?dataset={dataset}&config={config}&split={split}&offset={i * batch_size}&length={batch_size}", timeout=20).json()
 
35
  columns_descriptions = [
36
  get_column_description(column_name, features[column_name]) for column_name in scanned_columns
37
  ]
38
+ rows = islice(stream_rows(dataset, config, split), MAX_ROWS)
39
  presidio_entities = []
40
+ for presidio_entity in islice(presidio_scan_entities(
41
  rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
42
+ ), MAX_ENTITIES):
43
  presidio_entities.append(presidio_entity)
44
  yield f"Presidio scan results for {dataset}:", pd.DataFrame(presidio_entities)
45