"""Scan dataset rows for PII (personally identifiable information) with Presidio.

String columns are batched, each cell is prefixed with a short description of its
column, and Presidio's batch analyzer reports the detected entities per row and column.
"""

import re
from itertools import count, islice
from typing import Any, Iterable, Literal, Optional, TypedDict, TypeVar, Union, overload

from datasets import Features, Value
from datasets.features.features import FeatureType, _visit
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult


Row = dict[str, Any]
T = TypeVar("T")
BATCH_SIZE = 1  # rows are analyzed one by one; see the cache note below
MAX_TEXT_LENGTH = 500  # each cell is truncated to this many characters before analysis
analyzer = AnalyzerEngine()
batch_analyzer = BatchAnalyzerEngine(analyzer)


class PresidioEntity(TypedDict):
    """A single PII entity detected in one cell (row/column) of the dataset."""

    text: str
    type: str
    row_idx: int
    column_name: str

@overload
def batched(it: Iterable[T], n: int) -> Iterable[list[T]]:
    ...


@overload
def batched(it: Iterable[T], n: int, with_indices: Literal[False]) -> Iterable[list[T]]:
    ...


@overload
def batched(it: Iterable[T], n: int, with_indices: Literal[True]) -> Iterable[tuple[list[int], list[T]]]:
    ...


def batched(
    it: Iterable[T], n: int, with_indices: bool = False
) -> Union[Iterable[list[T]], Iterable[tuple[list[int], list[T]]]]:
    """Yield lists of up to `n` consecutive items from `it`, optionally paired with their indices."""
    it, indices = iter(it), count()
    while batch := list(islice(it, n)):
        yield (list(islice(indices, len(batch))), batch) if with_indices else batch
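
# A minimal sanity check of `batched` (illustrative example, not part of the original module):
#
# >>> list(batched("abcde", 2))
# [['a', 'b'], ['c', 'd'], ['e']]
# >>> list(batched("abcde", 2, with_indices=True))
# [([0, 1], ['a', 'b']), ([2, 3], ['c', 'd']), ([4], ['e'])]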


def mask(text: str) -> str:
    """Keep at most the first two characters of each word and replace the remaining alphanumerics with `*`."""
    return " ".join(
        word[: min(2, len(word) - 1)] + re.sub("[A-Za-z0-9]", "*", word[min(2, len(word) - 1) :])
        for word in text.split(" ")
    )
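
# Example of the masking scheme (illustrative, derived from the function above):
#
# >>> mask("john.doe@example.com is my email")
# 'jo**.***@*******.*** i* m* em***'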


def get_strings(row_content: Any) -> str:
    """Recursively collect the strings nested in a cell, joined with newlines; non-string leaves are ignored."""
    if isinstance(row_content, str):
        return row_content
    if isinstance(row_content, dict):
        if "src" in row_content:
            return ""  # could be image or audio data, not text
        row_content = list(row_content.values())
    if isinstance(row_content, list):
        str_items = (get_strings(row_content_item) for row_content_item in row_content)
        return "\n".join(str_item for str_item in str_items if str_item)
    return ""


def _simple_analyze_iterator_cache(
    batch_analyzer: BatchAnalyzerEngine,
    texts: list[str],
    language: str,
    score_threshold: float,
    cache: dict[str, list[RecognizerResult]],
) -> list[list[RecognizerResult]]:
    """Run the batch analyzer on `texts`, reusing cached results for texts seen in the previous call."""
    not_cached_results = iter(
        batch_analyzer.analyze_iterator(
            (text for text in texts if text not in cache), language=language, score_threshold=score_threshold
        )
    )
    results = [cache[text] if text in cache else next(not_cached_results) for text in texts]
    # only keep the current call's results in the cache
    cache.clear()
    cache.update(dict(zip(texts, results)))
    return results
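
# Note: the cache only keeps the previous call's results, so it pays off when consecutive
# batches contain identical texts (e.g. a repeated boilerplate column). Illustrative sketch:
#
# >>> cache: dict[str, list[RecognizerResult]] = {}
# >>> _ = _simple_analyze_iterator_cache(batch_analyzer, ["hello"], "en", 0.8, cache)  # runs Presidio
# >>> _ = _simple_analyze_iterator_cache(batch_analyzer, ["hello"], "en", 0.8, cache)  # served from the cache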


def analyze(
    batch_analyzer: BatchAnalyzerEngine,
    batch: list[dict[str, str]],
    indices: Iterable[int],
    scanned_columns: list[str],
    columns_descriptions: list[str],
    cache: Optional[dict[str, list[RecognizerResult]]] = None,
) -> list[PresidioEntity]:
    """Analyze one batch of rows and return the PII entities detected in the scanned columns.

    Each cell is prefixed with a short description of its column to give Presidio some
    context; entities detected inside that prefix itself are filtered out.
    """
    cache = {} if cache is None else cache
    texts = [
        f"The following is {columns_description} data:\n\n{example[column_name] or ''}"
        for example in batch
        for column_name, columns_description in zip(scanned_columns, columns_descriptions)
    ]
    recognizer_results = _simple_analyze_iterator_cache(
        batch_analyzer, texts, language="en", score_threshold=0.8, cache=cache
    )
    return [
        PresidioEntity(
            text=texts[i * len(scanned_columns) + j][recognizer_result.start : recognizer_result.end],
            type=recognizer_result.entity_type,
            row_idx=row_idx,
            column_name=column_name,
        )
        for i, row_idx, recognizer_row_results in zip(
            count(), indices, batched(recognizer_results, len(scanned_columns))
        )
        for j, column_name, columns_description, recognizer_column_results in zip(
            count(), scanned_columns, columns_descriptions, recognizer_row_results
        )
        for recognizer_result in recognizer_column_results
        if recognizer_result.start >= len(f"The following is {columns_description} data:\n\n")
    ]


def presidio_scan_entities(
    rows: Iterable[Row], scanned_columns: list[str], columns_descriptions: list[str]
) -> Iterable[PresidioEntity]:
    """Scan the given columns of `rows` for PII and yield the detected entities, row by row."""
    cache: dict[str, list[RecognizerResult]] = {}
    rows_with_scanned_columns_only = (
        {column_name: get_strings(row[column_name])[:MAX_TEXT_LENGTH] for column_name in scanned_columns}
        for row in rows
    )
    for indices, batch in batched(rows_with_scanned_columns_only, BATCH_SIZE, with_indices=True):
        yield from analyze(
            batch_analyzer=batch_analyzer,
            batch=batch,
            indices=indices,
            scanned_columns=scanned_columns,
            columns_descriptions=columns_descriptions,
            cache=cache,
        )
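
# Hedged usage sketch (row contents and detected entities below are illustrative; the
# actual output depends on Presidio's recognizers and the 0.8 score threshold):
#
# >>> rows = [{"text": "My name is John Doe and I live in Paris.", "label": 0}]
# >>> list(presidio_scan_entities(rows, scanned_columns=["text"], columns_descriptions=["text"]))
# [{'text': 'John Doe', 'type': 'PERSON', 'row_idx': 0, 'column_name': 'text'},
#  {'text': 'Paris', 'type': 'LOCATION', 'row_idx': 0, 'column_name': 'text'}]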


def get_columns_with_strings(features: Features) -> list[str]:
    """Return the names of the columns whose (possibly nested) feature contains a string type."""
    columns_with_strings: list[str] = []

    for column, feature in features.items():
        str_column = str(column)
        with_string = False

        def classify(feature: FeatureType) -> None:
            nonlocal with_string
            if isinstance(feature, Value) and feature.dtype == "string":
                with_string = True

        _visit(feature, classify)
        if with_string:
            columns_with_strings.append(str_column)
    return columns_with_strings
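
# Illustrative example with `datasets` features:
#
# >>> features = Features({"id": Value("int64"), "text": Value("string"), "answers": {"text": [Value("string")]}})
# >>> get_columns_with_strings(features)
# ['text', 'answers']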


def get_column_description(column_name: str, feature: FeatureType) -> str:
    """Describe a column for the analysis prefix, listing its nested field names if any."""
    nested_fields: list[str] = []

    def get_nested_field_names(feature: FeatureType) -> None:
        nonlocal nested_fields
        if isinstance(feature, dict):
            nested_fields += list(feature)

    _visit(feature, get_nested_field_names)
    return f"{column_name} (with {', '.join(nested_fields)})" if nested_fields else column_name