# Hugging Face Space: chrisfinlayson/foundry-pdf-redact (status: Sleeping)
# File size: 7,179 Bytes

import gradio as gr
import os
import fitz
import re

# Compiled PII patterns.
#
# Every pattern is written as a raw string so that regex escapes such as
# ``\d`` or ``\:`` reach the ``re`` module untouched instead of relying on
# Python leaving unknown string escapes alone (which raises a warning on
# current interpreters). The pattern text itself is unchanged.

# Dates: "<day> [of] <month>" or "<month> <day>", optionally followed by a
# comma and a 4-digit year, or a numeric d/m/y form using "-", "." or "/".
date = re.compile(
    r"(?:(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?\s+(?:of\s+)?(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)|(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)\s+(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?)(?:\,)?\s*(?:\d{4})?|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4}",
    re.IGNORECASE,
)

# Times: "h:mm" / "hh:mm" with an optional am/pm suffix, or "<digit>am/pm".
# NOTE: this name shadows the stdlib ``time`` module inside this file.
time = re.compile(r"\d{1,2}:\d{2} ?(?:[ap]\.?m\.?)?|\d[ap]\.?m\.?", re.IGNORECASE)

# Phone numbers: optional country/area code followed by 3 + 4 digits, or a
# 2-digit prefix followed by 2 + 3 + 4 digits.
phone = re.compile(
    r"""((?:(?<![\d-])(?:\+?\d{1,3}[-.\s*]?)?(?:\(?\d{3}\)?[-.\s*]?)?\d{3}[-.\s*]?\d{4}(?![\d-]))|(?:(?<![\d-])(?:(?:\(\+?\d{2}\))|(?:\+?\d{2}))\s*\d{2}\s*\d{3}\s*\d{4}(?![\d-])))"""
)

# Phone numbers that carry an extension marker (#, x, ext, extension).
phones_with_exts = re.compile(
    r"((?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?(?:[2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?(?:[0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(?:\d+)?))",
    re.IGNORECASE,
)

# E-mail addresses: local part, "@", then one or more dot-separated labels.
email = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
    re.IGNORECASE,
)

# IPv4 addresses: four dot-separated octets in the range 0-255.
ip = re.compile(
    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
    re.IGNORECASE,
)

# IPv6 addresses, including the form with a trailing dotted IPv4 part.
# NOTE(review): the embedded IPv4 octets use 25[0-4] whereas ``ip`` above
# uses 25[0-5]; left as-is here, confirm whether 255 should be accepted.
ipv6 = re.compile(
    r"\s*(?!.*::.*::)(?:(?!:)|:(?=:))(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)){6}(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)[0-9a-f]{0,4}(?:(?<=::)|(?<!:)|(?<=:)(?<!::):)|(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)){3})\s*",
    re.VERBOSE | re.IGNORECASE | re.DOTALL,
)

# Credit card numbers: four groups of 4 digits (optionally separated by a
# space or hyphen) or a bare run of 15-16 digits, not followed by a digit.
credit_card = re.compile(r"((?:(?:\d{4}[- ]?){3}\d{4}|\d{15,16}))(?![\d])")

# Bitcoin addresses: "1" or "3" followed by 26-33 characters drawn from the
# alphabet in the character class, not embedded in a longer such run.
btc_address = re.compile(
    r"(?<![a-km-zA-HJ-NP-Z0-9])[13][a-km-zA-HJ-NP-Z0-9]{26,33}(?![a-km-zA-HJ-NP-Z0-9])"
)

# Street addresses: 1-4 digit number, up to 20 word/space characters, then a
# street-type suffix followed by whitespace or end of text.
street_address = re.compile(
    r"\d{1,4} [\w\s]{1,20}(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard|blvd)\W?(?=\s|$)",
    re.IGNORECASE,
)

# 5-digit codes with an optional 4-digit extension (ZIP / ZIP+4 shape).
zip_code = re.compile(r"\b\d{5}(?:[-\s]\d{4})?\b")

# "PO Box <number>" with optional dots and an optional space after the "P".
po_box = re.compile(r"P\.? ?O\.? Box \d+", re.IGNORECASE)
# UK postcodes: the special "GIR 0AA" code, or an outward code (area letters
# plus district digits/letter) followed by optional spaces and the inward
# code (digit + two letters).
postcodes = re.compile("([gG][iI][rR] {0,}0[aA]{2})|((([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y]?[0-9][0-9]?)|(([a-pr-uwyzA-PR-UWYZ][0-9][a-hjkstuwA-HJKSTUW])|([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y][0-9][abehmnprv-yABEHMNPRV-Y]))) {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2})")

# UK phone numbers: "0207"/"0208" + 3 + 4 digits, or a 5-digit area code
# starting 01-08 + 3 + 3 digits, each with optional single spaces.
# Fixes over the previous pattern:
#   * "[0-9{2}" was an unterminated character class, so only one optional
#     character was consumed instead of exactly two digits;
#   * "[7,8]" also accepted a literal comma.
# The two capturing groups are kept so ``findall`` still yields 2-tuples.
# NOTE(review): "^" binds only to the first alternative and "$" only to the
# second; left unchanged so detection is not narrowed.
ukphones = re.compile(r"^\s*\(?(020[78]{1}\)?[ ]?[1-9]{1}[0-9]{2}[ ]?[0-9]{4})|(0[1-8]{1}[0-9]{3}\)?[ ]?[1-9]{1}[0-9]{2}[ ]?[0-9]{3})\s*$")

# Registry of every PII detector, keyed by the attribute name under which
# PiiRegex exposes it. Insertion order is the order in which categories are
# evaluated and reported.
regexes = dict(
    dates=date,
    times=time,
    phones=phone,
    phones_with_exts=phones_with_exts,
    emails=email,
    ips=ip,
    ipv6s=ipv6,
    credit_cards=credit_card,
    btc_addresses=btc_address,
    street_addresses=street_address,
    zip_codes=zip_code,
    po_boxes=po_box,
    postcodes=postcodes,
    ukphones=ukphones,
)

class regex:
    """Find all matches of one compiled regex pattern in a text.

    Takes an object and a compiled regex pattern as input. Calling the
    instance produces a finder function; that function searches the text it
    is given, or the ``text`` attribute of the stored object when it is given
    nothing (or anything falsy).
    """

    def __init__(self, obj, regex):
        # obj: any object exposing a ``text`` attribute (the fallback input).
        # regex: compiled pattern; only its ``findall`` method is used.
        self.obj = obj
        self.regex = regex

    def __call__(self, *args):
        """Return the finder function; positional arguments are ignored."""

        def regex_method(text=None):
            # A falsy ``text`` (None or "") falls back to the owner's text.
            haystack = text if text else self.obj.text
            return list(self.regex.findall(haystack))

        return regex_method

class PiiRegex(object):
    """Run every pattern registered in ``regexes`` against a piece of text.

    For each key in ``regexes`` the instance gets an attribute of that name:

    * constructed without text, the attribute is a finder function that
      accepts an optional ``text`` argument and returns the list of matches;
    * constructed with text, the attribute is the list of matches already
      found in that text.
    """

    def __init__(self, text=""):
        self.text = text

        # Build instance attributes of callables.
        self._bind_finders()

        if text:
            self._evaluate_finders()

    def _bind_finders(self):
        """Attach one finder function per registered pattern."""
        for name, pattern in regexes.items():
            setattr(self, name, regex(self, pattern)(self))

    def _evaluate_finders(self):
        """Replace each finder function with its matches on ``self.text``."""
        for name in regexes:
            finder = getattr(self, name)
            setattr(self, name, finder())

    def any_match(self, text=""):
        """Return True if any registered pattern matches.

        When ``text`` is given it replaces ``self.text`` and all per-category
        results are recomputed. Each matching category name is printed.
        """
        if text:
            self.text = text

            # Regenerate the finders, then evaluate them on the new text.
            self._bind_finders()
            self._evaluate_finders()

        matches = []
        for match in regexes:
            found = getattr(self, match)
            # Without text the attribute is still a finder function. A
            # function object is always truthy, so it must be called to get
            # real results; otherwise empty text reports every category.
            if callable(found):
                found = found()
            # If we've got a result, add it to matches.
            if found:
                print (f"PII located in document: {match}")
                matches.append(match)

        return bool(matches)


# Redactor (defined next) is used to redact sensitive information from a PDF
# file. It takes a file as input and redacts all the sensitive information
# found in the file. The redacted file is saved as a new PDF file.

class Redactor:
    """Redact every text line that contains PII from a PDF file.

    The input may be a filesystem path (``str`` or ``os.PathLike``) or an
    upload-style object whose ``name`` attribute holds the path. The result
    is written next to the input as ``<stem>_redacted.pdf``.
    """

    @staticmethod
    def _resolve_path(file):
        """Return the filesystem path for a path or an object with ``.name``."""
        # Path-like values are checked first: ``pathlib.Path.name`` is only
        # the final component, so it must not be mistaken for an upload.
        if isinstance(file, (str, os.PathLike)):
            return os.fspath(file)
        return file.name

    # static methods work independent of class object
    @staticmethod
    def get_sensitive_data(lines):
        """Return the non-empty lines in which PiiRegex finds any match."""
        # Empty lines are skipped up front: they cannot contain PII and
        # there is nothing to search for on the page.
        return [line for line in lines if line and PiiRegex(line).any_match()]

    # constructor
    def __init__(self, file):
        self.file = file

    def redaction(self):
        """Redact sensitive lines on every page and save the new PDF.

        Returns:
            The path of the redacted PDF that was written.
        """
        source_path = self._resolve_path(self.file)
        redacted_file = os.path.splitext(source_path)[0] + '_redacted.pdf'

        # The context manager closes the document even if a page fails.
        with fitz.open(source_path) as doc:
            for page in doc:
                lines = page.get_text("text").split('\n')
                for data in self.get_sensitive_data(lines):
                    # one redaction annotation per occurrence of the line
                    for area in page.search_for(data):
                        annot = page.add_redact_annot(
                            area.quad,
                            text='REDACTED',
                            fontname=None,
                            fontsize=11,
                            fill=(1, 1, 1),
                            text_color=(0, 0, 0),
                            cross_out=True,
                        )
                        annot.update()
                # applying the redaction
                page.apply_redactions()
            # saving it to a new pdf
            doc.save(redacted_file)

        print(f"Successfully redacted. The redacted file is saved as {redacted_file}")
        return redacted_file
 


def redact_pdf(file):
    """Redact PII from an uploaded PDF and return the redacted file's path.

    Args:
        file: A filesystem path (``str`` / ``os.PathLike``) or an upload
            object whose ``name`` attribute holds the path.

    Returns:
        The path ``<stem>_redacted.pdf`` next to the input file.

    Raises:
        ValueError: If no file was provided.
    """
    if file is None:
        # Fail with a clear message instead of an AttributeError later on.
        raise ValueError("No PDF file was provided.")

    # Path-like values are used directly; anything else is treated as an
    # upload object carrying its path in ``.name``.
    if isinstance(file, (str, os.PathLike)):
        source_path = os.fspath(file)
    else:
        source_path = file.name

    # Load the redaction pipeline
    redactor = Redactor(file)
    redactor.redaction()
    # Return the redacted pdf file
    return os.path.splitext(source_path)[0] + '_redacted.pdf'


# Gradio wiring: one PDF upload in, one redacted PDF out, served by
# ``redact_pdf``. The interface is launched as soon as the module runs.
description = (
    'This tool detects and redacts the following types of PII information: '
    'dates, times, phone numbers, emails, IP addresses, credit card numbers, '
    'Bitcoin addresses, street addresses, zip codes, PO boxes, UK postcodes, '
    'and UK phone numbers.'
)

inputs = [gr.File(label="Upload PDF")]
outputs = [gr.File(label="Redacted PDF")]

iface = gr.Interface(
    fn=redact_pdf,
    inputs=inputs,
    outputs=outputs,
    title="PDF Redactor",
    description=description,
)
iface.launch()