import os
import pathlib
import tempfile

import fitz
import streamlit as st
from fastai.learner import load_learner
from fastai.vision.all import *
from fpdf import FPDF
from icevision.all import *
from icevision.models.checkpoint import *
from PIL import Image as PILImage

CHECKPOINT_PATH = "./allsynthetic-imgsize768.pth"


@st.cache
def load_icevision_model():
    return model_from_checkpoint(CHECKPOINT_PATH)


@st.cache
def load_fastai_model():
    return load_learner("fastai-classification-model.pkl")


# Object-detection model: locates redaction and content regions on a page image.
checkpoint_and_model = load_icevision_model()
model = checkpoint_and_model["model"]
model_type = checkpoint_and_model["model_type"]
class_map = checkpoint_and_model["class_map"]
img_size = checkpoint_and_model["img_size"]
valid_tfms = tfms.A.Adapter(
    [*tfms.A.resize_and_pad(img_size), tfms.A.Normalize()]
)

# Classification model: decides whether a page contains redactions at all.
learn = load_fastai_model()
labels = learn.dls.vocab


@st.experimental_memo
def get_content_area(pred_dict) -> int:
    """Return the area (in pixels) of the first detected 'content' region."""
    if "content" not in pred_dict["detection"]["labels"]:
        return 0
    content_bboxes = [
        pred_dict["detection"]["bboxes"][idx]
        for idx, label in enumerate(pred_dict["detection"]["labels"])
        if label == "content"
    ]
    cb = content_bboxes[0]
    return (cb.xmax - cb.xmin) * (cb.ymax - cb.ymin)


@st.experimental_memo
def get_redaction_area(pred_dict) -> int:
    """Return the combined area (in pixels) of all detected 'redaction' regions."""
    if "redaction" not in pred_dict["detection"]["labels"]:
        return 0
    redaction_bboxes = [
        pred_dict["detection"]["bboxes"][idx]
        for idx, label in enumerate(pred_dict["detection"]["labels"])
        if label == "redaction"
    ]
    return sum(
        (bbox.xmax - bbox.xmin) * (bbox.ymax - bbox.ymin)
        for bbox in redaction_bboxes
    )


st.title("Redaction Detector")

st.image(
    "./synthetic-redactions.jpg",
    width=300,
)

uploaded_pdf = st.file_uploader(
    "Upload a PDF...",
    type="pdf",
    accept_multiple_files=False,
    help=(
        "This application processes PDF files. "
        "Please upload a document you believe to contain redactions."
    ),
    on_change=None,
)

# Sidebar customisation options: two checkboxes and a confidence slider.
st.sidebar.header("Customisation Options")

graph_checkbox = st.sidebar.checkbox(
    "Show analysis charts",
    value=True,
    help="Display charts analysing the redactions found in the document.",
)
extract_images_checkbox = st.sidebar.checkbox(
    "Extract redacted images",
    value=True,
    help=(
        "Create a PDF file containing the redacted images with an object "
        "detection overlay highlighting their locations and the confidence "
        "the model had when detecting the redactions."
    ),
)

confidence = st.sidebar.slider(
    "Confidence level (%)",
    min_value=0,
    max_value=100,
    value=80,
)


@st.cache
def get_pdf_document(uploaded_file):
    """Write the uploaded bytes to disk and open them with PyMuPDF."""
    output_path = os.path.join(tempfile.gettempdir(), "output.pdf")
    with open(output_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return fitz.open(output_path)


@st.cache
def get_image_predictions(img, detection_threshold):
    """Run the object-detection model over a single page image."""
    return model_type.end2end_detect(
        img,
        valid_tfms,
        model,
        class_map=class_map,
        detection_threshold=detection_threshold,
        display_label=True,
        display_bbox=True,
        return_img=True,
        font_size=16,
        label_color="#FF59D6",
    )


if uploaded_pdf is None:
    st.markdown(pathlib.Path("article.md").read_text())
else:
    st.text("Opening PDF...")
    filename_without_extension = uploaded_pdf.name[:-4]
    results = []
    images = []
    document = get_pdf_document(uploaded_pdf)
    total_image_areas = 0
    total_content_areas = 0
    total_redaction_area = 0
    tmp_dir = tempfile.gettempdir()
    output_dir = os.path.join(tmp_dir, filename_without_extension)
    os.makedirs(output_dir, exist_ok=True)

    # Classify every page, saving an image of each page that looks redacted.
    for page_num, page in enumerate(document, start=1):
        image_pixmap = page.get_pixmap()
        image = image_pixmap.tobytes()
        _, _, probs = learn.predict(image)
        results.append(
            {labels[i]: float(probs[i]) for i in range(len(labels))}
        )
        if results[-1]["redacted"] > (confidence / 100):
            redaction_count = len(images)
            image_pixmap.save(
                os.path.join(output_dir, f"page-{page_num}.png")
            )
            images.append(
                [
                    f"Redacted page #{redaction_count + 1} on page {page_num}",
                    os.path.join(output_dir, f"page-{page_num}.png"),
                ]
            )

    redacted_pages = [
        str(page + 1)
        for page in range(len(results))
        if results[page]["redacted"] > (confidence / 100)
    ]
    report = os.path.join(output_dir, "redacted_pages.pdf")

    if extract_images_checkbox:
        # Build a PDF report containing each redacted page with the
        # detection overlay drawn on top.
        pdf = FPDF(unit="cm", format="A4")
        pdf.set_auto_page_break(0)
        imagelist = sorted(
            i for i in os.listdir(output_dir) if i.endswith("png")
        )
        for image in imagelist:
            with PILImage.open(os.path.join(output_dir, image)) as img:
                width, height = img.size
                # Match the report page orientation to the page image.
                if width > height:
                    pdf.add_page(orientation="L")
                else:
                    pdf.add_page(orientation="P")
                pred_dict = get_image_predictions(img, confidence / 100)
            total_image_areas += pred_dict["width"] * pred_dict["height"]
            total_content_areas += get_content_area(pred_dict)
            total_redaction_area += get_redaction_area(pred_dict)
            pred_dict["img"].save(os.path.join(output_dir, f"pred-{image}"))
            pdf.image(
                os.path.join(output_dir, f"pred-{image}"),
                w=pdf.w,
                h=pdf.h,
            )
        pdf.output(report, "F")

    text_output = (
        f"A total of {len(redacted_pages)} pages were redacted.\n\n"
        f"The redacted page numbers were: {', '.join(redacted_pages)}.\n\n"
    )
\n\n" if not extract_images_checkbox: st.text(text_output) # DISPLAY IMAGES else: total_redaction_proportion = round( (total_redaction_area / total_image_areas) * 100, 1 ) content_redaction_proportion = round( (total_redaction_area / total_content_areas) * 100, 1 ) redaction_analysis = f"- {total_redaction_proportion}% of the total area of the redacted pages was redacted. \n- {content_redaction_proportion}% of the actual content of those redacted pages was redacted." st.text(text_output + redaction_analysis) # DISPLAY IMAGES