| """ |
| CLI-first programmatic API surface. |
| |
| These functions provide a minimal, runnable Python interface that mirrors the |
| Gradio `api_name` routes, but executes the underlying workflows via the CLI |
| engine (`cli_redact.main(direct_mode_args=...)`). |
| |
| Return values are lists of output file paths created in `output_dir`. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import os |
| import tempfile |
| from pathlib import Path |
| from typing import Any, Iterable |
|
|
|
|
def _ensure_list(v: str | list[str] | tuple[str, ...]) -> list[str]:
    """Normalise one path or a collection of paths into a list of strings.

    Args:
        v: Either a single value (``str``, ``bytes`` or ``os.PathLike``) or
            any other iterable of values (list, tuple, set, generator, ...).

    Returns:
        A new list holding ``str(item)`` for every item of a collection, or
        a one-element list holding ``str(v)`` for a single value.
    """
    # str and bytes are technically iterable, but here they are one value;
    # splitting them into characters is never what a caller wants.
    if isinstance(v, (str, bytes, os.PathLike)):
        return [str(v)]
    # Expand every other iterable. Checking only for list/tuple would turn a
    # set or generator into a single repr string such as "{'a.pdf'}".
    if isinstance(v, Iterable):
        return [str(item) for item in v]
    return [str(v)]
|
|
|
|
def _snapshot_files(folder: str) -> set[str]:
    """Collect the path of every file found beneath ``folder``.

    The tree is walked recursively with ``os.walk``. A ``folder`` that does
    not exist yields an empty set instead of raising.

    Args:
        folder: Directory to scan.

    Returns:
        A set with one string path per file found.
    """
    base = Path(folder)
    if base.exists():
        return {
            str(Path(current_dir) / filename)
            for current_dir, _subdirs, filenames in os.walk(base)
            for filename in filenames
        }
    return set()
|
|
|
|
def _default_output_dir(prefix: str) -> str:
    """Create a new temporary directory and return its path.

    The directory is made with ``tempfile.mkdtemp`` and its name starts with
    ``doc_redaction_<prefix>_``. Nothing in this function removes it again.

    Args:
        prefix: Label embedded in the directory name.

    Returns:
        Path of the newly created directory.
    """
    name_prefix = f"doc_redaction_{prefix}_"
    return tempfile.mkdtemp(prefix=name_prefix)
|
|
|
|
def _run_cli(
    *,
    gradio_api_name: str,
    overrides: dict[str, Any],
    output_dir: str | None,
) -> list[str]:
    """Run ``cli_redact.main`` in direct mode and report the files it created.

    The arguments handed to the CLI are the dict from
    ``get_cli_default_args_dict()`` with ``overrides`` layered on top. The
    output directory is resolved in this order:

    1. the ``output_dir`` argument, when it is not ``None``;
    2. a truthy ``"output_dir"`` entry in ``overrides``;
    3. a fresh temporary directory named after ``gradio_api_name``.

    Args:
        gradio_api_name: Route name, used only to label the temp directory.
        overrides: CLI arguments that replace the defaults.
        output_dir: Directory the CLI should write to, or ``None``.

    Returns:
        Sorted paths of files present under the output directory after the
        run that were not present before it. A file that already existed and
        was overwritten is therefore not reported.
    """
    from cli_redact import get_cli_default_args_dict
    from cli_redact import main as cli_main

    # Build a fresh dict so the object returned by the defaults helper is
    # never mutated; later entries (the overrides) take precedence.
    merged = {**get_cli_default_args_dict(), **overrides}

    if output_dir is None:
        # Honour a directory supplied through ``overrides`` rather than
        # silently replacing it with a temporary one.
        output_dir = overrides.get("output_dir") or _default_output_dir(
            gradio_api_name
        )
    target = str(output_dir)
    merged["output_dir"] = target

    # Diff the directory contents around the run to find what was created.
    before = _snapshot_files(target)
    cli_main(direct_mode_args=merged)
    after = _snapshot_files(target)

    return sorted(after - before)
|
|
|
|
| |
| |
| |
|
|
|
|
def redact_document(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    ocr_method: str | None = None,
    pii_detector: str | None = None,
    instruction: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Run the CLI ``redact`` task; mirrors Gradio ``api_name='redact_document'``.

    Args:
        input_files: One input path or a list of input paths.
        output_dir: Forwarded to ``_run_cli``, which picks the directory
            when this is ``None``.
        ocr_method: Sent as the ``ocr_method`` CLI argument when not ``None``.
        pii_detector: Sent as the ``pii_detector`` CLI argument when not
            ``None``.
        instruction: Sent as the ``custom_llm_instructions`` CLI argument
            when not ``None``.
        overrides: Extra CLI arguments, applied last so they win over every
            value set above.

    Returns:
        The list of output file paths reported by ``_run_cli``.
    """
    optional_args = {
        "ocr_method": ocr_method,
        "pii_detector": pii_detector,
        "custom_llm_instructions": instruction,
    }
    cli_args: dict[str, Any] = {
        "task": "redact",
        "input_file": _ensure_list(input_files),
    }
    # Only forward the options the caller actually supplied.
    cli_args.update(
        {key: value for key, value in optional_args.items() if value is not None}
    )
    cli_args.update(overrides or {})
    return _run_cli(
        gradio_api_name="redact_document",
        overrides=cli_args,
        output_dir=output_dir,
    )
|
|
|
|
def redact_data(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    instruction: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Run the CLI ``redact`` task; mirrors Gradio ``api_name='redact_data'``.

    Args:
        input_files: One input path or a list of input paths.
        output_dir: Forwarded to ``_run_cli``, which picks the directory
            when this is ``None``.
        instruction: Sent as the ``custom_llm_instructions`` CLI argument
            when not ``None``.
        overrides: Extra CLI arguments, applied last.

    Returns:
        The list of output file paths reported by ``_run_cli``.
    """
    cli_args: dict[str, Any] = {
        "task": "redact",
        "input_file": _ensure_list(input_files),
    }
    if instruction is not None:
        cli_args["custom_llm_instructions"] = instruction
    cli_args.update(overrides or {})
    return _run_cli(
        gradio_api_name="redact_data",
        overrides=cli_args,
        output_dir=output_dir,
    )
|
|
|
|
def find_duplicate_pages(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    similarity_threshold: float | None = None,
    min_word_count: int | None = None,
    min_consecutive_pages: int | None = None,
    greedy_match: bool | None = None,
    combine_pages: bool | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Run page-level deduplication; mirrors Gradio ``api_name='find_duplicate_pages'``.

    The CLI is invoked with ``task='deduplicate'`` and
    ``duplicate_type='pages'``. Each optional argument is forwarded under
    its own name only when it is not ``None``.

    Args:
        input_files: One input path or a list of input paths.
        output_dir: Forwarded to ``_run_cli``, which picks the directory
            when this is ``None``.
        similarity_threshold: Forwarded unchanged.
        min_word_count: Forwarded unchanged.
        min_consecutive_pages: Forwarded unchanged.
        greedy_match: Forwarded as the string ``"True"`` or ``"False"``.
        combine_pages: Forwarded as the string ``"True"`` or ``"False"``.
        overrides: Extra CLI arguments, applied last.

    Returns:
        The list of output file paths reported by ``_run_cli``.
    """
    cli_args: dict[str, Any] = {
        "task": "deduplicate",
        "duplicate_type": "pages",
        "input_file": _ensure_list(input_files),
    }

    passthrough = (
        ("similarity_threshold", similarity_threshold),
        ("min_word_count", min_word_count),
        ("min_consecutive_pages", min_consecutive_pages),
    )
    for key, value in passthrough:
        if value is not None:
            cli_args[key] = value

    # Boolean switches are handed to the CLI in their string form.
    switches = (("greedy_match", greedy_match), ("combine_pages", combine_pages))
    for key, flag in switches:
        if flag is not None:
            cli_args[key] = "True" if flag else "False"

    cli_args.update(overrides or {})
    return _run_cli(
        gradio_api_name="find_duplicate_pages",
        overrides=cli_args,
        output_dir=output_dir,
    )
|
|
|
|
def find_duplicate_tabular(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    text_columns: list[str] | None = None,
    similarity_threshold: float | None = None,
    min_word_count: int | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Run tabular deduplication; mirrors Gradio ``api_name='find_duplicate_tabular'``.

    The CLI is invoked with ``task='deduplicate'`` and
    ``duplicate_type='tabular'``. Each optional argument is forwarded under
    its own name only when it is not ``None``.

    Args:
        input_files: One input path or a list of input paths.
        output_dir: Forwarded to ``_run_cli``, which picks the directory
            when this is ``None``.
        text_columns: Forwarded as a new list containing the same items.
        similarity_threshold: Forwarded unchanged.
        min_word_count: Forwarded unchanged.
        overrides: Extra CLI arguments, applied last.

    Returns:
        The list of output file paths reported by ``_run_cli``.
    """
    cli_args: dict[str, Any] = {
        "task": "deduplicate",
        "duplicate_type": "tabular",
        "input_file": _ensure_list(input_files),
    }
    if text_columns is not None:
        # Copy so the CLI never holds a reference to the caller's list.
        cli_args["text_columns"] = [*text_columns]

    passthrough = (
        ("similarity_threshold", similarity_threshold),
        ("min_word_count", min_word_count),
    )
    for key, value in passthrough:
        if value is not None:
            cli_args[key] = value

    cli_args.update(overrides or {})
    return _run_cli(
        gradio_api_name="find_duplicate_tabular",
        overrides=cli_args,
        output_dir=output_dir,
    )
|
|
|
|
def summarise_document(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Run the CLI ``summarise`` task; mirrors Gradio ``api_name='summarise_document'``.

    Args:
        input_files: One input path or a list of input paths.
        output_dir: Forwarded to ``_run_cli``, which picks the directory
            when this is ``None``.
        overrides: Extra CLI arguments, applied on top of the task and
            input-file entries.

    Returns:
        The list of output file paths reported by ``_run_cli``.
    """
    cli_args: dict[str, Any] = {}
    cli_args["task"] = "summarise"
    cli_args["input_file"] = _ensure_list(input_files)
    cli_args.update(overrides or {})
    return _run_cli(
        gradio_api_name="summarise_document",
        overrides=cli_args,
        output_dir=output_dir,
    )
|
|
|
|
def combine_review_pdfs(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Run the CLI ``combine_review_pdfs`` task; mirrors the Gradio route of the same name.

    Args:
        input_files: One input path or a list of input paths.
        output_dir: Forwarded to ``_run_cli``, which picks the directory
            when this is ``None``.
        overrides: Extra CLI arguments, applied on top of the task and
            input-file entries.

    Returns:
        The list of output file paths reported by ``_run_cli``.
    """
    cli_args: dict[str, Any] = {}
    cli_args["task"] = "combine_review_pdfs"
    cli_args["input_file"] = _ensure_list(input_files)
    cli_args.update(overrides or {})
    return _run_cli(
        gradio_api_name="combine_review_pdfs",
        overrides=cli_args,
        output_dir=output_dir,
    )
|
|
|
|
| |
| |
| |
|
|
|
|
def combine_review_csvs(
    input_files: Iterable[str],
    *,
    output_dir: str | None = None,
) -> list[str]:
    """Merge review CSV files; mirrors Gradio ``api_name='combine_review_csvs'``.

    Args:
        input_files: Paths of the CSV files to merge. A single ``str`` or
            ``os.PathLike`` is accepted and treated as one path.
        output_dir: Destination folder. Falls back to
            ``tools.config.OUTPUT_FOLDER`` when ``None`` or empty. The
            folder is created if it does not exist.

    Returns:
        The value returned by ``tools.helper_functions.merge_csv_files``.
    """
    from tools.config import OUTPUT_FOLDER
    from tools.helper_functions import merge_csv_files

    out_dir = str(output_dir or OUTPUT_FOLDER)
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    # A bare string is itself iterable: without this guard a single path
    # would be split into one bogus "file" per character.
    if isinstance(input_files, (str, os.PathLike)):
        csv_paths = [str(input_files)]
    else:
        csv_paths = [str(path) for path in input_files]

    # merge_csv_files is always handed a folder ending in a separator.
    if not out_dir.endswith(("/", "\\")):
        out_dir += "/"

    return merge_csv_files(csv_paths, output_folder=out_dir)
|
|
|
|
def export_review_redaction_overlay(
    *,
    page_image_path: str,
    boxes: list[dict[str, Any]],
    page_number: int = 1,
    doc_base_name: str = "review",
    review_df_records: list[dict[str, Any]] | None = None,
    label_abbrev_chars: int | None = None,
) -> list[str]:
    """Render redaction boxes over a page image.

    Same behaviour as Gradio ``api_name='page_redaction_review_image'``;
    Agent API route ``export_review_redaction_overlay``. The output is
    written under ``tools.config.OUTPUT_FOLDER`` (expanded and resolved),
    which is created if missing.

    Args:
        page_image_path: Passed as the ``image`` entry of the annotator dict.
        boxes: Passed as the ``boxes`` entry of the annotator dict.
        page_number: Forwarded to ``visualise_review_redaction_boxes``.
        doc_base_name: Forwarded to ``visualise_review_redaction_boxes``.
        review_df_records: Records turned into the review DataFrame; an
            empty DataFrame is used when this is ``None`` or empty.
        label_abbrev_chars: Forwarded to ``visualise_review_redaction_boxes``.

    Returns:
        A one-element list with the path returned by
        ``visualise_review_redaction_boxes``, or an empty list when that
        return value is falsy.
    """
    import pandas as pd

    from tools.config import OUTPUT_FOLDER
    from tools.redaction_review import visualise_review_redaction_boxes

    if review_df_records:
        review_frame = pd.DataFrame(review_df_records)
    else:
        review_frame = pd.DataFrame()

    target_dir = Path(OUTPUT_FOLDER).expanduser().resolve()
    target_dir.mkdir(parents=True, exist_ok=True)

    saved_path = visualise_review_redaction_boxes(
        {"image": page_image_path, "boxes": boxes},
        review_df=review_frame,
        output_folder=str(target_dir),
        page_number=page_number,
        doc_base_name=doc_base_name,
        label_abbrev_chars=label_abbrev_chars,
    )
    if not saved_path:
        return []
    return [saved_path]
|
|
|
|
def export_review_page_ocr_visualisation(
    *,
    page_image_path: str,
    ocr_results: dict[str, Any],
    page_number: int = 1,
    doc_base_name: str = "review",
) -> list[str]:
    """Render OCR word bounding boxes over a page image.

    Same behaviour as Gradio ``api_name='page_ocr_review_image'``; Agent API
    route ``export_review_page_ocr_visualisation``. Output goes under
    ``tools.config.OUTPUT_FOLDER`` (expanded and resolved), which is created
    if missing, in the ``review_ocr_visualisations`` visualisation folder.

    Args:
        page_image_path: Image file to open; it is converted to RGB.
        ocr_results: Forwarded to ``visualise_ocr_words_bounding_boxes``.
        page_number: Used, as an int, in the generated image name.
        doc_base_name: Used in the generated image name; ``'review'`` is
            substituted when it is empty.

    Returns:
        A list built from the value returned by
        ``visualise_ocr_words_bounding_boxes``.
    """
    from PIL import Image

    from tools.config import OUTPUT_FOLDER
    from tools.file_redaction import visualise_ocr_words_bounding_boxes

    out_dir = str(Path(OUTPUT_FOLDER).expanduser().resolve())
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    image_name = f"{str(doc_base_name or 'review')}_page{int(page_number)}.png"

    # Image.open keeps the file open until the image is closed. convert()
    # loads the pixel data and returns an independent image, so the source
    # handle can be released immediately instead of leaking.
    with Image.open(page_image_path) as source_image:
        rgb_image = source_image.convert("RGB")

    log_paths = visualise_ocr_words_bounding_boxes(
        rgb_image,
        ocr_results,
        image_name=image_name,
        output_folder=out_dir,
        visualisation_folder="review_ocr_visualisations",
        add_legend=True,
        log_files_output_paths=[],
    )
    return list(log_paths)
|
|
|
|
| |
| |
| |
|
|
|
|
def load_and_prepare_documents_or_data(*args: Any, **kwargs: Any) -> list[str]:
    """Placeholder for a Gradio route that has no single CLI task.

    Raises:
        NotImplementedError: Always, whatever arguments are passed.
    """
    message = "load_and_prepare_documents_or_data is Gradio-session-state driven and is not exposed as a single CLI task."
    raise NotImplementedError(message)
|
|
|
|
def apply_review_redactions(
    pdf_path: str,
    review_csv_path: str,
    *,
    output_dir: str | None = None,
    input_dir: str | None = None,
    text_extract_method: str | None = None,
    efficient_ocr: bool | None = None,
) -> list[str]:
    """Apply review redactions headlessly.

    Headless parity with Gradio ``api_name='apply_review_redactions'``. All
    arguments are forwarded by name to
    ``tools.simplified_api.run_apply_review_redactions``.

    Returns:
        Output file paths (redacted PDF, review CSV, logs, etc.) taken from
        the ``output_paths`` entry of the result; an empty list when that
        entry is missing or empty.
    """
    from tools.simplified_api import run_apply_review_redactions

    call_kwargs = {
        "pdf_path": pdf_path,
        "review_csv_path": review_csv_path,
        "output_dir": output_dir,
        "input_dir": input_dir,
        "text_extract_method": text_extract_method,
        "efficient_ocr": efficient_ocr,
    }
    outcome = run_apply_review_redactions(**call_kwargs)

    paths = outcome.get("output_paths")
    if not paths:
        return []
    return list(paths)
|
|
|
|
def word_level_ocr_text_search(*args: Any, **kwargs: Any) -> list[str]:
    """Placeholder for a Gradio route that has no CLI-first equivalent.

    Raises:
        NotImplementedError: Always, whatever arguments are passed.
    """
    message = "word_level_ocr_text_search is Gradio-session-state driven; no CLI-first equivalent is currently provided."
    raise NotImplementedError(message)
|
|
|
|
# Names exported by a star-import of this module. The underscore-prefixed
# helpers are deliberately left out.
__all__ = [
    "redact_document",
    "load_and_prepare_documents_or_data",
    "apply_review_redactions",
    "export_review_page_ocr_visualisation",
    "export_review_redaction_overlay",
    "word_level_ocr_text_search",
    "redact_data",
    "find_duplicate_pages",
    "find_duplicate_tabular",
    "summarise_document",
    "combine_review_csvs",
    "combine_review_pdfs",
]
|
|