lhoestq HF staff committed on
Commit
c143e76
1 Parent(s): e6ef189

update app

Browse files
Files changed (1) hide show
  1. app.py +35 -5
app.py CHANGED
@@ -1,13 +1,46 @@
 
 
 
1
  import gradio as gr
2
  import requests
3
  import pandas as pd
 
4
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
5
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def analyze_dataset(dataset: str) -> pd.DataFrame:
8
- yield f"Presidio scan results for {dataset}:", pd.DataFrame({"type": [], "text": [], "row_idx": [], "column_name": []})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- iface = gr.Interface(
11
  fn=analyze_dataset,
12
  inputs=[
13
  HuggingfaceHubSearch(
@@ -24,7 +57,4 @@ iface = gr.Interface(
24
  description="The space takes an HF dataset name as an input, and returns the list of entities detected by Presidio in the first samples.",
25
  )
26
 
27
- with gr.Blocks() as demo:
28
- iface.render()
29
-
30
  demo.launch()
 
1
from collections.abc import Iterator
from itertools import count
from typing import Any

import gradio as gr
import pandas as pd
import requests
from datasets import Features
from gradio_huggingfacehub_search import HuggingfaceHubSearch

from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
11
+
12
def stream_rows(dataset: str, config: str, split: str) -> Iterator[dict[str, Any]]:
    """Yield the rows of one dataset split, fetched page by page over HTTP.

    Pages of 100 rows are requested from the ``/rows`` endpoint until a page
    comes back without rows.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` query parameter.
        config: Configuration name sent as the ``config`` query parameter.
        split: Split name sent as the ``split`` query parameter.

    Yields:
        The ``"row"`` mapping of every item in each page's ``"rows"`` list.

    Raises:
        RuntimeError: If a response payload contains an ``"error"`` key; the
            exception message is that error value.
    """
    batch_size = 100
    for offset in count(step=batch_size):
        # Let requests build the query string so that values containing
        # reserved characters ("&", "#", spaces, ...) are escaped correctly.
        rows_resp = requests.get(
            "https://datasets-server.huggingface.co/rows",
            params={
                "dataset": dataset,
                "config": config,
                "split": split,
                "offset": offset,
                "length": batch_size,
            },
            timeout=20,
        ).json()
        if "error" in rows_resp:
            raise RuntimeError(rows_resp["error"])
        page = rows_resp.get("rows")
        if not page:
            # An empty (or missing) page marks the end of the split.
            break
        for row_item in page:
            yield row_item["row"]
22
 
23
def analyze_dataset(dataset: str) -> Iterator[tuple[str, pd.DataFrame]]:
    """Scan a dataset with Presidio and stream the growing result table.

    The function is a generator: each yielded item is a
    ``(status message, DataFrame)`` pair. The DataFrame is rebuilt from every
    entity found so far each time a new one arrives.

    The "default" config and the "train" split are preferred when present;
    otherwise the first config / split listed in the info response is used.

    Args:
        dataset: Dataset identifier sent to the ``/info`` endpoint and passed
            on to ``stream_rows``.

    Yields:
        ``("Presidio scan results for <dataset>:", entities_so_far)`` while
        the scan progresses, or a message prefixed with "❌" when the info
        request reports an error, no config/split is listed, or the scan
        raises ``RuntimeError``.
    """
    info_resp = requests.get(
        "https://datasets-server.huggingface.co/info",
        # Passing params lets requests escape the dataset name.
        params={"dataset": dataset},
        timeout=3,
    ).json()
    if "error" in info_resp:
        yield "❌ " + info_resp["error"], pd.DataFrame()
        return

    dataset_info = info_resp["dataset_info"]
    # A bare next(iter(...)) on an empty mapping would raise StopIteration
    # inside this generator, which Python turns into an opaque RuntimeError.
    config = "default" if "default" in dataset_info else next(iter(dataset_info), None)
    if config is None:
        yield f"❌ No config found for {dataset}", pd.DataFrame()
        return
    config_info = dataset_info[config]
    features = Features.from_dict(config_info["features"])
    splits = config_info["splits"]
    split = "train" if "train" in splits else next(iter(splits), None)
    if split is None:
        yield f"❌ No split found for {dataset}", pd.DataFrame()
        return

    scanned_columns = get_columns_with_strings(features)
    columns_descriptions = [
        get_column_description(column_name, features[column_name]) for column_name in scanned_columns
    ]
    rows = stream_rows(dataset, config, split)

    title = f"Presidio scan results for {dataset}:"
    presidio_entities = []
    # Emit the header immediately so something is shown even when the scan
    # finds no entity at all.
    yield title, pd.DataFrame(presidio_entities)
    try:
        for presidio_entity in presidio_scan_entities(
            rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
        ):
            presidio_entities.append(presidio_entity)
            yield title, pd.DataFrame(presidio_entities)
    except RuntimeError as err:
        # stream_rows raises RuntimeError when a rows response carries an
        # error; report it like the info error, keeping the partial results.
        yield f"❌ {err}", pd.DataFrame(presidio_entities)
42
 
43
+ demo = gr.Interface(
44
  fn=analyze_dataset,
45
  inputs=[
46
  HuggingfaceHubSearch(
 
57
  description="The space takes an HF dataset name as an input, and returns the list of entities detected by Presidio in the first samples.",
58
  )
59
 
 
 
 
60
  demo.launch()