Chris Finlayson
committed on
Commit
•
319ac5e
1
Parent(s):
3b045d4
Deploy to HF
Browse files- app.py +175 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import fitz
|
4 |
+
import re
|
5 |
+
|
6 |
+
# Compiled PII detection patterns.
#
# Every pattern is written as a raw string so that regex escapes such as
# ``\d`` or ``\.`` reach the ``re`` module untouched; in a normal string
# literal they are invalid escape sequences and trigger a SyntaxWarning on
# recent Python versions. The regex text itself is unchanged.

# Day + month name, month name + day (optional year), or numeric d/m/y forms.
date = re.compile(
    r"(?:(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?\s+(?:of\s+)?(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)|(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)\s+(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?)(?:\,)?\s*(?:\d{4})?|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4}",
    re.IGNORECASE,
)

# Clock times such as "10:30", "10:30 pm" or "9am".
time = re.compile(r"\d{1,2}:\d{2} ?(?:[ap]\.?m\.?)?|\d[ap]\.?m\.?", re.IGNORECASE)

# Phone numbers with optional country / area code prefixes.
phone = re.compile(
    r"((?:(?<![\d-])(?:\+?\d{1,3}[-.\s*]?)?(?:\(?\d{3}\)?[-.\s*]?)?\d{3}[-.\s*]?\d{4}(?![\d-]))|(?:(?<![\d-])(?:(?:\(\+?\d{2}\))|(?:\+?\d{2}))\s*\d{2}\s*\d{3}\s*\d{4}(?![\d-])))"
)

# Phone numbers followed by an extension marker ("#", "x", "ext", "extension").
phones_with_exts = re.compile(
    r"((?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?(?:[2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?(?:[0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(?:\d+)?))",
    re.IGNORECASE,
)

# E-mail addresses.
email = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
    re.IGNORECASE,
)

# Dotted-quad IPv4 addresses.
ip = re.compile(
    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
    re.IGNORECASE,
)

# IPv6 addresses (including the embedded-IPv4 tail form).
ipv6 = re.compile(
    r"\s*(?!.*::.*::)(?:(?!:)|:(?=:))(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)){6}(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)[0-9a-f]{0,4}(?:(?<=::)|(?<!:)|(?<=:)(?<!::):)|(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)){3})\s*",
    re.VERBOSE | re.IGNORECASE | re.DOTALL,
)

# 15/16-digit card numbers, optionally grouped in fours by "-" or " ".
credit_card = re.compile(r"((?:(?:\d{4}[- ]?){3}\d{4}|\d{15,16}))(?![\d])")

# Strings starting with 1 or 3 followed by 26-33 characters of the address alphabet.
btc_address = re.compile(
    r"(?<![a-km-zA-HJ-NP-Z0-9])[13][a-km-zA-HJ-NP-Z0-9]{26,33}(?![a-km-zA-HJ-NP-Z0-9])"
)

# House number, up to 20 word/space characters, then a street-type suffix.
street_address = re.compile(
    r"\d{1,4} [\w\s]{1,20}(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard|blvd)\W?(?=\s|$)",
    re.IGNORECASE,
)

# Five-digit codes with an optional four-digit extension.
zip_code = re.compile(r"\b\d{5}(?:[-\s]\d{4})?\b")

# "PO Box 123" in its punctuation variants.
po_box = re.compile(r"P\.? ?O\.? Box \d+", re.IGNORECASE)

# UK postcodes, including the special "GIR 0AA" code.
postcodes = re.compile(r"([gG][iI][rR] {0,}0[aA]{2})|((([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y]?[0-9][0-9]?)|(([a-pr-uwyzA-PR-UWYZ][0-9][a-hjkstuwA-HJKSTUW])|([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y][0-9][abehmnprv-yABEHMNPRV-Y]))) {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2})")
|
43 |
+
# UK landline numbers: London "020 7xxx xxxx" / "020 8xxx xxxx" numbers, or a
# five-digit area code followed by a six-digit local number.
# Fixes two typos in the original pattern: the unclosed class "[0-9{2}[ ]"
# (now "[0-9]{2}[ ]?") and "[7,8]", which also accepted a literal comma.
# NOTE(review): "^" binds only to the first alternative and "$" only to the
# second; that anchoring is kept as-is so existing matches are not narrowed.
ukphones = re.compile(
    r"^\s*\(?(020[78]\)?[ ]?[1-9][0-9]{2}[ ]?[0-9]{4})|(0[1-8][0-9]{3}\)?[ ]?[1-9][0-9]{2}[ ]?[0-9]{3})\s*$"
)
|
44 |
+
|
45 |
+
# Registry of every detector: category name -> compiled pattern.
# The category names double as attribute names on PiiRegex instances.
regexes = dict(
    dates=date,
    times=time,
    phones=phone,
    phones_with_exts=phones_with_exts,
    emails=email,
    ips=ip,
    ipv6s=ipv6,
    credit_cards=credit_card,
    btc_addresses=btc_address,
    street_addresses=street_address,
    zip_codes=zip_code,
    po_boxes=po_box,
    postcodes=postcodes,
    ukphones=ukphones,
)
|
61 |
+
|
62 |
+
|
63 |
+
class regex:
    """Pair a compiled pattern with an object that exposes a ``text`` attribute.

    Calling an instance (any positional arguments are accepted and ignored)
    yields a function that runs the pattern's ``findall``.
    """

    def __init__(self, obj, regex):
        self.obj = obj
        self.regex = regex

    def __call__(self, *args):
        def regex_method(text=None):
            """Return all matches in ``text``, or in the owner's text when
            ``text`` is missing or empty."""
            find_all = self.regex.findall
            haystack = text or self.obj.text
            return list(find_all(haystack))

        return regex_method
|
74 |
+
|
75 |
+
|
76 |
+
class PiiRegex(object):
    """Run every pattern in ``regexes`` against a piece of text.

    For each category name in ``regexes`` the instance gets an attribute of
    the same name. When the instance is created without text, the attribute
    is a callable that accepts the text to search; once text is available the
    attribute is replaced by the list of matches found in that text.
    """

    def __init__(self, text=""):
        self.text = text
        self._bind_matchers()
        if text:
            self._evaluate_matchers()

    def _bind_matchers(self):
        """(Re)create one matcher callable per category on this instance."""
        for name, pattern in regexes.items():
            setattr(self, name, regex(self, pattern)(self))

    def _evaluate_matchers(self):
        """Replace each matcher callable with its matches for ``self.text``."""
        for name in regexes:
            matcher = getattr(self, name)
            setattr(self, name, matcher())

    def any_match(self, text=""):
        """Return True if any pattern matches the text.

        If ``text`` is given it replaces ``self.text`` and all category
        attributes are recomputed for it. Otherwise the text already stored
        on the instance is used.
        """
        if text:
            self.text = text
            self._bind_matchers()
            self._evaluate_matchers()

        # Nothing to scan. Without this guard the still-unevaluated matcher
        # callables would count as truthy and an empty text would be reported
        # as containing PII.
        if not self.text:
            return False

        for name in regexes:
            found = getattr(self, name)
            # The attribute is still a callable when the text was assigned
            # after construction; evaluate it instead of testing the function.
            if callable(found):
                found = found()
            if found:
                return True
        return False
|
109 |
+
|
110 |
+
|
111 |
+
class Redactor:
    """Redact every text line of a PDF in which PiiRegex finds a match."""

    # static methods work independent of class object
    @staticmethod
    def get_sensitive_data(lines):
        """Return the entries of ``lines`` that contain PII.

        Blank lines are skipped up front: they cannot contain PII and must not
        reach the page search as an empty needle.
        """
        return [
            line for line in lines
            if line.strip() and PiiRegex(line).any_match()
        ]

    # constructor
    def __init__(self, file):
        self.file = file

    def redaction(self, output_path='redacted.pdf'):
        """Redact all sensitive lines and save the result.

        Args:
            output_path: Where the redacted PDF is written. Defaults to
                'redacted.pdf', the location used before this parameter
                existed.

        Returns:
            The path the redacted PDF was saved to.
        """
        # The context manager closes the document even if redaction fails.
        with fitz.open(self.file) as doc:
            for page in doc:
                lines = page.get_text("text").split('\n')
                # dict.fromkeys de-duplicates while keeping order: search_for
                # already returns every occurrence of a line on the page, so
                # a repeated line would otherwise be annotated several times.
                for data in dict.fromkeys(self.get_sensitive_data(lines)):
                    for area in page.search_for(data):
                        annot = page.add_redact_annot(
                            area.quad,
                            text='REDACTED',
                            fontname=None,
                            fontsize=11,
                            fill=(1, 1, 1),
                            text_color=(0, 0, 0),
                            cross_out=True,
                        )
                        annot.update()
                # applying the redaction
                page.apply_redactions()
            # saving it to a new pdf
            doc.save(output_path)
        print("Successfully redacted")
        return output_path
|
153 |
+
|
154 |
+
|
155 |
+
|
156 |
+
def redact_pdf(file):
    """Redact PII from an uploaded PDF and return the path of the result.

    Args:
        file: The upload handed over by the UI: either a filesystem path or
            a temp-file wrapper object whose ``name`` attribute is the path.

    Returns:
        The path of the redacted PDF ('redacted.pdf').

    Raises:
        ValueError: If no file was provided.
    """
    if file is None:
        raise ValueError("No PDF file was provided.")

    # Paths are used as-is (a Path's ``.name`` would be just the basename);
    # for file-like upload objects the on-disk location is in ``.name``.
    if isinstance(file, (str, os.PathLike)):
        pdf_path = file
    else:
        pdf_path = getattr(file, "name", file)

    # Load the redaction pipeline
    redactor = Redactor(pdf_path)
    redactor.redaction()
    # Return the redacted pdf file
    return 'redacted.pdf'
|
163 |
+
|
164 |
+
|
165 |
+
# Gradio UI: one PDF upload in, one redacted PDF out.
inputs = [gr.File(label="Upload PDF")]
outputs = [gr.File(label="Redacted PDF")]

description = (
    'This tool detects and redacts the following types of PII information: '
    'dates, times, phone numbers, emails, IP addresses, credit card numbers, '
    'Bitcoin addresses, street addresses, zip codes, PO boxes, UK postcodes, '
    'and UK phone numbers.'
)

iface = gr.Interface(
    fn=redact_pdf,
    inputs=inputs,
    outputs=outputs,
    title="PDF Redactor",
    description=description,
)
# Start the web app as soon as the module is executed.
iface.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
PyMuPDF
|
3 |
+
transformers
|