import os
import re

import fitz
import gradio as gr


# Compiled patterns for each PII category. All pattern strings are raw
# strings so that regex escapes such as ``\d`` and ``\s`` reach the regex
# engine untouched instead of being (mis)read as Python string escapes.

# Day-month / month-day dates written with month names, or numeric
# d/m/y-style dates separated by "-", "." or "/".
date = re.compile(
    r"(?:(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?\s+(?:of\s+)?(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)|(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)\s+(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?)(?:\,)?\s*(?:\d{4})?|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4}",
    re.IGNORECASE,
)

# Clock times such as "10:30", "10:30 pm" or "9am".
time = re.compile(r"\d{1,2}:\d{2} ?(?:[ap]\.?m\.?)?|\d[ap]\.?m\.?", re.IGNORECASE)

# Phone numbers with optional country and area codes.
phone = re.compile(
    r"""((?:(?<![\d-])(?:\+?\d{1,3}[-.\s*]?)?(?:\(?\d{3}\)?[-.\s*]?)?\d{3}[-.\s*]?\d{4}(?![\d-]))|(?:(?<![\d-])(?:(?:\(\+?\d{2}\))|(?:\+?\d{2}))\s*\d{2}\s*\d{3}\s*\d{4}(?![\d-])))"""
)

# Phone numbers followed by an extension marker ("#", "x", "ext", "extension").
phones_with_exts = re.compile(
    r"((?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?(?:[2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?(?:[0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(?:\d+)?))",
    re.IGNORECASE,
)

# E-mail addresses.
email = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
    re.IGNORECASE,
)

# Dotted-quad IPv4 addresses.
ip = re.compile(
    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
    re.IGNORECASE,
)

# IPv6 addresses. The pattern contains no literal whitespace or "#", so the
# VERBOSE flag does not alter how it is read.
ipv6 = re.compile(
    r"\s*(?!.*::.*::)(?:(?!:)|:(?=:))(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)){6}(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)[0-9a-f]{0,4}(?:(?<=::)|(?<!:)|(?<=:)(?<!::):)|(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)){3})\s*",
    re.VERBOSE | re.IGNORECASE | re.DOTALL,
)

# 15/16-digit card numbers, optionally grouped in fours by "-" or " ".
credit_card = re.compile(r"((?:(?:\d{4}[- ]?){3}\d{4}|\d{15,16}))(?![\d])")

# Bitcoin addresses starting with "1" or "3".
btc_address = re.compile(
    r"(?<![a-km-zA-HJ-NP-Z0-9])[13][a-km-zA-HJ-NP-Z0-9]{26,33}(?![a-km-zA-HJ-NP-Z0-9])"
)

# House number followed by a street name ending in a common street suffix.
street_address = re.compile(
    r"\d{1,4} [\w\s]{1,20}(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard|blvd)\W?(?=\s|$)",
    re.IGNORECASE,
)

# Five-digit ZIP codes with an optional four-digit extension.
zip_code = re.compile(r"\b\d{5}(?:[-\s]\d{4})?\b")

# "PO Box 123" in its common spellings.
po_box = re.compile(r"P\.? ?O\.? Box \d+", re.IGNORECASE)

# UK postcodes.
postcodes = re.compile(r"([gG][iI][rR] {0,}0[aA]{2})|((([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y]?[0-9][0-9]?)|(([a-pr-uwyzA-PR-UWYZ][0-9][a-hjkstuwA-HJKSTUW])|([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y][0-9][abehmnprv-yABEHMNPRV-Y]))) {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2})")

# UK phone numbers. Two typos in the character classes are fixed here:
# ``[0-9{2}`` was missing its closing bracket (so "0207 123 4567" could not
# match) and ``[7,8]`` also accepted a literal comma.
# NOTE(review): "^" binds only to the first alternative and "$" only to the
# second; that anchoring is left unchanged.
ukphones = re.compile(r"^\s*\(?(020[78]{1}\)?[ ]?[1-9]{1}[0-9]{2}[ ]?[0-9]{4})|(0[1-8]{1}[0-9]{3}\)?[ ]?[1-9]{1}[0-9]{2}[ ]?[0-9]{3})\s*$")

# Category name -> compiled pattern; the keys become attribute names on
# objects that iterate this table.
regexes = {
    "dates": date,
    "times": time,
    "phones": phone,
    "phones_with_exts": phones_with_exts,
    "emails": email,
    "ips": ip,
    "ipv6s": ipv6,
    "credit_cards": credit_card,
    "btc_addresses": btc_address,
    "street_addresses": street_address,
    "zip_codes": zip_code,
    "po_boxes": po_box,
    "postcodes": postcodes,
    "ukphones": ukphones,
}


class regex:
    """Pair a compiled pattern with a text-bearing object and build a finder.

    ``obj`` is any object exposing a ``text`` attribute; ``regex`` is a
    compiled pattern (anything providing ``findall``). Calling the instance
    returns a function that lists the pattern's matches.

    The lowercase class name is kept because other code refers to it by
    this name.
    """

    def __init__(self, obj, regex):
        self.obj = obj
        self.regex = regex

    def __call__(self, *args):
        """Return a finder function for the bound pattern.

        Positional arguments are accepted but ignored.

        The returned function takes an optional ``text``. When ``text`` is
        omitted or falsy (``None`` or ``""``) it searches ``obj.text``
        instead, read at call time. It returns the ``findall`` result as a
        list.
        """
        def regex_method(text=None):
            # A falsy argument means "use the owner's current text".
            return list(self.regex.findall(text or self.obj.text))

        return regex_method


class PiiRegex:
    """Run every pattern in ``regexes`` against a piece of text.

    One attribute is created per key of ``regexes``. When the instance is
    built without text, each attribute is a finder function that can be
    called with the text to search. When text is supplied (to the
    constructor or to ``any_match``), each attribute is replaced by the list
    of matches found in that text.
    """

    def __init__(self, text=""):
        self.text = text
        self._bind_finders()
        if text:
            self._evaluate_finders()

    def _bind_finders(self):
        """Attach one finder function per category to this instance."""
        for name, pattern in regexes.items():
            setattr(self, name, regex(self, pattern)(self))

    def _evaluate_finders(self):
        """Replace each finder with its list of matches in ``self.text``."""
        for name in regexes:
            finder = getattr(self, name)
            setattr(self, name, finder())

    def any_match(self, text=""):
        """Return True if any category matches the text.

        If ``text`` is non-empty it replaces ``self.text`` and every
        category attribute is recomputed as a list of matches. Otherwise the
        current state is used. Each matching category is reported on stdout.
        """
        if text:
            self.text = text
            self._bind_finders()
            self._evaluate_finders()

        matches = []
        for name in regexes:
            found = getattr(self, name)
            # Without text the attributes are still finder functions, which
            # are always truthy; call them so that an empty text is not
            # reported as matching every category.
            if callable(found):
                found = found()
            if found:
                print(f"PII located in document: {name}")
                matches.append(name)

        return bool(matches)


class Redactor:
    """Redact lines containing PII from a PDF and save a redacted copy.

    ``file`` may be a path (``str`` or path-like) or an object with a
    ``name`` attribute holding the path, such as an open temporary file.
    The redacted copy is written next to the source as
    ``<source stem>_redacted.pdf``.
    """

    @staticmethod
    def get_sensitive_data(lines):
        """Return the lines for which ``PiiRegex(line).any_match()`` is true."""
        return [line for line in lines if PiiRegex(line).any_match()]

    def __init__(self, file):
        self.file = file

    @property
    def path(self):
        """Filesystem path of the source PDF as a string."""
        if isinstance(self.file, (str, os.PathLike)):
            return os.fspath(self.file)
        # File-like upload objects carry their path in ``name``.
        return self.file.name

    def redaction(self):
        """Redact every PII-bearing line and save the result.

        Each page's text is split into lines; every occurrence of a flagged
        line on that page is covered with a 'REDACTED' annotation, and the
        redactions are applied page by page.

        Returns:
            The path of the saved redacted PDF.
        """
        source_path = self.path
        redacted_file = os.path.splitext(source_path)[0] + '_redacted.pdf'

        # The context manager closes the document even if a page fails.
        with fitz.open(source_path) as doc:
            for page in doc:
                lines = page.get_text("text").split('\n')
                for data in self.get_sensitive_data(lines):
                    # Never search for an empty string.
                    if not data:
                        continue
                    for area in page.search_for(data):
                        annot = page.add_redact_annot(
                            area.quad,
                            text='REDACTED',
                            fontname=None,
                            fontsize=11,
                            fill=(1, 1, 1),
                            text_color=(0, 0, 0),
                            cross_out=True,
                        )
                        annot.update()
                page.apply_redactions()

            doc.save(redacted_file)

        print(f"Successfully redacted. The redacted file is saved as {redacted_file}")
        return redacted_file


def redact_pdf(file):
    """Redact the uploaded PDF and return the path of the redacted copy.

    ``file`` is handed to ``Redactor`` unchanged. It may be a path (``str``
    or path-like) or an object whose ``name`` attribute holds the path.
    NOTE(review): which of the two Gradio passes depends on its version and
    the File component's configuration; confirm against the deployed one.
    """
    if isinstance(file, (str, os.PathLike)):
        source_path = os.fspath(file)
    else:
        source_path = file.name

    redactor = Redactor(file)
    redactor.redaction()

    return os.path.splitext(source_path)[0] + '_redacted.pdf'


# Gradio wiring: one PDF upload in, one redacted PDF out.
inputs = [
    gr.File(label="Upload PDF"),
]

outputs = [
    gr.File(label="Redacted PDF"),
]

description = 'This tool detects and redacts the following types of PII information: dates, times, phone numbers, emails, IP addresses, credit card numbers, Bitcoin addresses, street addresses, zip codes, PO boxes, UK postcodes, and UK phone numbers.'

iface = gr.Interface(
    fn=redact_pdf,
    inputs=inputs,
    outputs=outputs,
    title="PDF Redactor",
    description=description,
)

# Launched at module level, as before, so running the file starts the app.
iface.launch()