Spaces:

strickvl
/

redaction-detector-streamlit

Runtime error

File size: 7,292 Bytes

b0f2ac0

from io import BytesIO
import os
import pathlib
import tempfile
import time

import fitz
import gradio as gr
import PIL
import skimage
import streamlit as st
from fastai.learner import load_learner
from fastai.vision.all import *
from fpdf import FPDF
from icevision.all import *
from icevision.models.checkpoint import *
from PIL import Image as PILImage


CHECKPOINT_PATH = "./allsynthetic-imgsize768.pth"


@st.cache
def load_icevision_model():
    """Load the IceVision detection checkpoint stored at ``CHECKPOINT_PATH``.

    Returns:
        Whatever ``model_from_checkpoint`` produces for that path. The
        module-level setup code indexes the result with the keys "model",
        "model_type", "class_map" and "img_size".
    """
    checkpoint = model_from_checkpoint(CHECKPOINT_PATH)
    return checkpoint


@st.cache
def load_fastai_model():
    """Load the exported fastai page classifier.

    Returns:
        The learner returned by ``load_learner`` for
        ``fastai-classification-model.pkl`` (path relative to the working
        directory).
    """
    learner = load_learner("fastai-classification-model.pkl")
    return learner


# Pull the pieces needed for inference out of the loaded checkpoint.
checkpoint_and_model = load_icevision_model()
model, model_type, class_map, img_size = (
    checkpoint_and_model[key]
    for key in ("model", "model_type", "class_map", "img_size")
)

# Transforms applied to every image before detection: resize/pad to the
# checkpoint's image size, then normalise.
resize_steps = tfms.A.resize_and_pad(img_size)
valid_tfms = tfms.A.Adapter([*resize_steps, tfms.A.Normalize()])


# Page-level classifier and the class labels it predicts.
learn = load_fastai_model()
labels = learn.dls.vocab


@st.experimental_memo
def get_content_area(pred_dict) -> int:
    """Return the area of the first bounding box labelled "content".

    Args:
        pred_dict: Prediction mapping whose ``["detection"]`` entry holds
            parallel ``"labels"`` and ``"bboxes"`` sequences; each bbox
            exposes ``xmin``, ``xmax``, ``ymin`` and ``ymax``.

    Returns:
        Width times height of the first "content" box, or 0 when no label
        equals "content". Later "content" boxes are ignored.
    """
    detection = pred_dict["detection"]
    first_content_idx = next(
        (
            position
            for position, name in enumerate(detection["labels"])
            if name == "content"
        ),
        None,
    )
    if first_content_idx is None:
        return 0

    box = detection["bboxes"][first_content_idx]
    box_width = box.xmax - box.xmin
    box_height = box.ymax - box.ymin
    return box_width * box_height


@st.experimental_memo
def get_redaction_area(pred_dict) -> int:
    """Return the summed area of every bounding box labelled "redaction".

    Args:
        pred_dict: Prediction mapping whose ``["detection"]`` entry holds
            parallel ``"labels"`` and ``"bboxes"`` sequences; each bbox
            exposes ``xmin``, ``xmax``, ``ymin`` and ``ymax``.

    Returns:
        The sum of width times height over all "redaction" boxes, or 0 when
        there are none. Overlapping boxes are counted once each.
    """
    detection = pred_dict["detection"]
    total_area = 0
    for position, name in enumerate(detection["labels"]):
        if name != "redaction":
            continue
        box = detection["bboxes"][position]
        total_area = total_area + (box.xmax - box.xmin) * (box.ymax - box.ymin)
    return total_area


# --- Main page: title, illustration and the PDF upload widget -------------
st.title("Redaction Detector")

st.image(
    "./synthetic-redactions.jpg",
    width=300,
)
uploaded_pdf = st.file_uploader(
    "Upload a PDF...",
    type="pdf",
    accept_multiple_files=False,
    help="This application processes PDF files. Please upload a document you believe to contain redactions.",
    on_change=None,
)

# --- Sidebar: options that control the analysis ---------------------------
st.sidebar.header("Customisation Options")

# Toggle for the analysis charts.
graph_checkbox = st.sidebar.checkbox(
    "Show analysis charts",
    value=True,
    help="Display charts analysing the redactions found in the document.",
)

# Toggle for building the PDF of annotated redacted pages.
extract_images_checkbox = st.sidebar.checkbox(
    "Extract redacted images",
    value=True,
    help="Create a PDF file containing the redacted images with an object detection overlay highlighting their locations and the confidence the model had when detecting the redactions.",
)

# Confidence threshold as a percentage; the rest of the script divides it by
# 100 before comparing it with model probabilities.
confidence = st.sidebar.slider(
    "Confidence level (%)",
    min_value=0,
    max_value=100,
    value=80,
)


@st.cache
def get_pdf_document(input):
    """Open an uploaded PDF as a PyMuPDF document.

    Args:
        input: The uploaded file object. It must provide ``getvalue()``
            returning the raw PDF bytes, as ``io.BytesIO`` does.

    Returns:
        The document returned by ``fitz.open`` for those bytes.
    """
    # Parse straight from memory instead of round-tripping through disk.
    # The old disk path was built with ``str / str`` (a TypeError), the
    # function read the global upload rather than ``input``, and it re-opened
    # a different relative path from the one it had written.
    return fitz.open(stream=input.getvalue(), filetype="pdf")


@st.cache
def get_image_predictions(img):
    """Run end-to-end object detection on one page image.

    Args:
        img: The image handed to ``model_type.end2end_detect``.

    Returns:
        The result of ``end2end_detect``. Callers in this file read its
        "width", "height", "detection" and "img" entries.

    The detection threshold is taken from the module-level ``confidence``
    percentage at call time.
    """
    detect_options = {
        "class_map": class_map,
        "detection_threshold": confidence / 100,
        "display_label": True,
        "display_bbox": True,
        "return_img": True,
        "font_size": 16,
        "label_color": "#FF59D6",
    }
    return model_type.end2end_detect(img, valid_tfms, model, **detect_options)


if uploaded_pdf is None:
    # Nothing uploaded yet: show the explanatory article instead.
    st.markdown(pathlib.Path("article.md").read_text())
else:
    st.text("Opening PDF...")
    # Slider value is a percentage; model probabilities are in [0, 1].
    threshold = confidence / 100

    # ``stem`` drops the extension whatever its length and discards any
    # directory components, so the name cannot escape the temp directory.
    # The fallback keeps the working directory distinct from the temp root.
    filename_without_extension = pathlib.Path(uploaded_pdf.name).stem or "abc"
    results = []
    images = []
    document = get_pdf_document(uploaded_pdf)
    total_image_areas = 0
    total_content_areas = 0
    total_redaction_area = 0
    tmp_dir = tempfile.gettempdir()
    work_dir = os.path.join(tmp_dir, filename_without_extension)
    # Create the directory once up front; ``exist_ok`` makes reruns safe.
    os.makedirs(work_dir, exist_ok=True)

    # Pass 1: classify every page and save the ones flagged as redacted.
    for page_num, page in enumerate(document, start=1):
        image_pixmap = page.get_pixmap()
        _, _, probs = learn.predict(image_pixmap.tobytes())
        page_scores = {
            label: float(prob) for label, prob in zip(labels, probs)
        }
        results.append(page_scores)
        # Use the same "redacted" score that builds ``redacted_pages`` below
        # so the saved images and the reported page list always agree.
        if page_scores["redacted"] > threshold:
            image_path = os.path.join(work_dir, f"page-{page_num}.png")
            image_pixmap.save(image_path)
            images.append(
                [
                    f"Redacted page #{len(images) + 1} on page {page_num}",
                    image_path,
                ]
            )

    redacted_pages = [
        str(page_num)
        for page_num, scores in enumerate(results, start=1)
        if scores["redacted"] > threshold
    ]
    report = os.path.join(work_dir, "redacted_pages.pdf")

    # Pass 2: run object detection on the flagged pages and build the report.
    # Iterate over the pages saved in this run rather than listing the
    # directory: a listing also returns stale ``page-*``/``pred-*`` files from
    # earlier reruns and sorts "page-10" before "page-2".
    if extract_images_checkbox and images:
        pdf = FPDF(unit="cm", format="A4")
        pdf.set_auto_page_break(0)
        for _, image_path in images:
            pred_path = os.path.join(
                work_dir, f"pred-{os.path.basename(image_path)}"
            )
            with PILImage.open(image_path) as img:
                width, height = img.size
                pdf.add_page(orientation="L" if width > height else "P")
                pred_dict = get_image_predictions(img)

                total_image_areas += pred_dict["width"] * pred_dict["height"]
                total_content_areas += get_content_area(pred_dict)
                total_redaction_area += get_redaction_area(pred_dict)

                pred_dict["img"].save(pred_path)
            pdf.image(pred_path, w=pdf.w, h=pdf.h)
        pdf.output(report, "F")

    text_output = f"A total of {len(redacted_pages)} pages were redacted. \n\nThe redacted page numbers were: {', '.join(redacted_pages)}. \n\n"

    if not extract_images_checkbox:
        st.text(text_output)
        # DISPLAY IMAGES
    else:
        # Each percentage is reported only when its denominator is non-zero:
        # no flagged pages (or no detected "content" box) would otherwise
        # raise ZeroDivisionError.
        analysis_lines = []
        if total_image_areas:
            total_redaction_proportion = round(
                (total_redaction_area / total_image_areas) * 100, 1
            )
            analysis_lines.append(
                f"- {total_redaction_proportion}% of the total area of the redacted pages was redacted. "
            )
        if total_content_areas:
            content_redaction_proportion = round(
                (total_redaction_area / total_content_areas) * 100, 1
            )
            analysis_lines.append(
                f"- {content_redaction_proportion}% of the actual content of those redacted pages was redacted."
            )

        redaction_analysis = "\n".join(analysis_lines)

        st.text(text_output + redaction_analysis)
        # DISPLAY IMAGES