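# Gradio Space that scans the first rows of a Hub dataset with Presidio
# and reports the detected PII entities (see the Interface definition below).
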
from itertools import count, islice
from typing import Any, Iterable

import gradio as gr
import requests
import pandas as pd
from datasets import Features
from gradio_huggingfacehub_search import HuggingfaceHubSearch

from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities

MAX_ENTITIES = 100
MAX_ROWS = 100


def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
    """Stream rows from the datasets-server /rows endpoint, one batch at a time."""
    batch_size = 100
    for i in count():
        rows_resp = requests.get(
            f"https://datasets-server.huggingface.co/rows?dataset={dataset}&config={config}&split={split}&offset={i * batch_size}&length={batch_size}",
            timeout=20,
        ).json()
        if "error" in rows_resp:
            raise RuntimeError(rows_resp["error"])
        if not rows_resp["rows"]:
            break
        for row_item in rows_resp["rows"]:
            yield row_item["row"]


def analyze_dataset(dataset: str) -> Iterable[tuple[str, pd.DataFrame]]:
    info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
    if "error" in info_resp:
        yield "❌ " + info_resp["error"], pd.DataFrame()
        return
    # Pick the "default" config and "train" split when available, otherwise the first ones listed
    config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
    features = Features.from_dict(info_resp["dataset_info"][config]["features"])
    split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(iter(info_resp["dataset_info"][config]["splits"]))
    # Only string columns are scanned; each column gets a short description for Presidio
    scanned_columns = get_columns_with_strings(features)
    columns_descriptions = [
        get_column_description(column_name, features[column_name]) for column_name in scanned_columns
    ]
    rows = islice(stream_rows(dataset, config, split), MAX_ROWS)
    presidio_entities = []
    # Yield partial results as entities are detected, capped at MAX_ENTITIES
    for presidio_entity in islice(presidio_scan_entities(
        rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
    ), MAX_ENTITIES):
        presidio_entities.append(presidio_entity)
        yield f"Presidio scan results for {dataset}:", pd.DataFrame(presidio_entities)


demo = gr.Interface(
    fn=analyze_dataset,
    inputs=[
        HuggingfaceHubSearch(
            label="Hub Dataset ID",
            placeholder="Search for dataset id on Huggingface",
            search_type="dataset",
        ),
    ],
    outputs=[
        gr.Markdown(),
        gr.DataFrame(),
    ],
    title="Scan datasets using Presidio",
    description="The space takes an HF dataset name as an input, and returns the list of entities detected by Presidio in the first samples.",
)
demo.launch()