|
from itertools import count, islice |
|
from typing import Any, Iterable, TypedVar |
|
|
|
import gradio as gr |
|
import requests |
|
import pandas as pd |
|
from datasets import Features |
|
from gradio_huggingfacehub_search import HuggingfaceHubSearch |
|
|
|
from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities |
|
|
|
MAX_ROWS = 100 |
|
T = TypedVar("T") |
|
|
|
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
    """Yield rows of a dataset split, fetched in batches from the datasets-server API.

    Args:
        dataset: Hub dataset id (e.g. "user/name").
        config: Dataset configuration name.
        split: Split name (e.g. "train").

    Yields:
        One dict per row, keyed by column name.

    Raises:
        RuntimeError: if the datasets-server responds with an error payload.
    """
    batch_size = 100
    for batch_idx in count():
        # Pass query parameters via `params` so requests URL-encodes them;
        # raw f-string interpolation breaks on names needing escaping.
        rows_resp = requests.get(
            "https://datasets-server.huggingface.co/rows",
            params={
                "dataset": dataset,
                "config": config,
                "split": split,
                "offset": batch_idx * batch_size,
                "length": batch_size,
            },
            timeout=20,
        ).json()
        if "error" in rows_resp:
            raise RuntimeError(rows_resp["error"])
        if not rows_resp["rows"]:
            # Empty batch: we are past the end of the split.
            break
        for row_item in rows_resp["rows"]:
            yield row_item["row"]
|
|
|
class track_iter:
    """Iterable wrapper that counts how many items have been consumed.

    The running count is exposed as ``next_idx`` so progress can be reported
    while another consumer (e.g. a scanner) drains the underlying iterable.
    """

    def __init__(self, it: Iterable[T]):
        self.it = it        # wrapped iterable
        self.next_idx = 0   # number of items yielded so far

    # Fixed annotation: the original declared `-> T`, but iterating yields a
    # stream of items, not a single one.
    def __iter__(self) -> Iterable[T]:
        for item in self.it:
            # Increment before yielding so next_idx reflects the item
            # currently being processed downstream (1-based).
            self.next_idx += 1
            yield item
|
|
|
def analyze_dataset(dataset: str) -> Iterable[tuple[str, pd.DataFrame]]:
    """Scan the first rows of a Hub dataset for PII and stream progress updates.

    Picks the "default" (or first) config and the "train" (or first) split,
    then runs Presidio over the string-typed columns of at most MAX_ROWS rows.

    Args:
        dataset: Hub dataset id (e.g. "user/name").

    Yields:
        ``(status_markdown, entities_dataframe)`` tuples — one per detected
        entity — so the Gradio UI refreshes incrementally. (The original
        annotation ``-> pd.DataFrame`` was wrong: this is a generator.)
    """
    info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
    if "error" in info_resp:
        # Surface API errors in the UI instead of raising.
        # NOTE(review): the "β " prefix looks like a garbled status glyph — confirm intended character.
        yield "β " + info_resp["error"], pd.DataFrame()
        return
    # Prefer the conventional "default" config / "train" split, else the first available.
    config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
    features = Features.from_dict(info_resp["dataset_info"][config]["features"])
    split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(iter(info_resp["dataset_info"][config]["splits"]))
    num_rows = min(info_resp["dataset_info"][config]["splits"][split]["num_examples"], MAX_ROWS)
    # Only string-typed columns are handed to Presidio.
    scanned_columns = get_columns_with_strings(features)
    columns_descriptions = [
        get_column_description(column_name, features[column_name]) for column_name in scanned_columns
    ]
    # track_iter exposes how many rows have been consumed, for the progress line.
    rows = track_iter(islice(stream_rows(dataset, config, split), MAX_ROWS))
    presidio_entities = []
    for presidio_entity in presidio_scan_entities(
        rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
    ):
        presidio_entities.append(presidio_entity)
        # Emit a fresh status line plus the accumulated entities after each hit.
        yield f"Scanning {dataset} [{rows.next_idx} / {num_rows}]:", pd.DataFrame(presidio_entities)
|
|
|
# UI wiring: analyze_dataset is a generator, so Gradio streams each yielded
# (markdown, dataframe) pair to the two outputs as the scan progresses.
demo = gr.Interface(
    fn=analyze_dataset,
    inputs=[
        # Searchable picker for Hub dataset ids.
        HuggingfaceHubSearch(
            label="Hub Dataset ID",
            placeholder="Search for dataset id on Huggingface",
            search_type="dataset",
        ),
    ],
    outputs=[
        gr.Markdown(),   # status / progress line
        gr.DataFrame(),  # detected Presidio entities, grows as they are found
    ],
    title="Scan datasets using Presidio",
    description="The space takes an HF dataset name as an input, and returns the list of entities detected by Presidio in the first samples.",
)

demo.launch()
|
|