Chris Finlayson committed on
Commit
319ac5e
1 Parent(s): 3b045d4

Deploy to HF

Browse files
Files changed (2) hide show
  1. app.py +175 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import fitz
4
+ import re
5
+
6
# Compiled PII detection patterns.
#
# Every pattern is written as a raw string so that regex escapes such as
# ``\d`` or ``\s`` reach the ``re`` module untouched instead of being
# treated as (invalid) string escape sequences by the Python parser.

# Textual dates ("12th of March 2020", "March 12, 2020") with an optional
# year, or numeric dates separated by "-", "." or "/".
date = re.compile(
    r"(?:(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?\s+(?:of\s+)?(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)|(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)\s+(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?)(?:\,)?\s*(?:\d{4})?|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4}",
    re.IGNORECASE,
)

# Clock times such as "5:30", "5:30 pm" or "5pm".
time = re.compile(r"\d{1,2}:\d{2} ?(?:[ap]\.?m\.?)?|\d[ap]\.?m\.?", re.IGNORECASE)

# Phone numbers: optional country code and area code followed by 3+4 digits,
# or a two-digit country code followed by 2+3+4 digits.
phone = re.compile(
    r"""((?:(?<![\d-])(?:\+?\d{1,3}[-.\s*]?)?(?:\(?\d{3}\)?[-.\s*]?)?\d{3}[-.\s*]?\d{4}(?![\d-]))|(?:(?<![\d-])(?:(?:\(\+?\d{2}\))|(?:\+?\d{2}))\s*\d{2}\s*\d{3}\s*\d{4}(?![\d-])))"""
)

# Phone numbers that carry an extension marker ("#", "x", "ext", "extension").
phones_with_exts = re.compile(
    r"((?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?(?:[2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?(?:[0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(?:\d+)?))",
    re.IGNORECASE,
)

# E-mail addresses: local part, "@", then one or more dot-separated labels.
email = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
    re.IGNORECASE,
)

# Dotted-quad IPv4 addresses with each octet limited to 0-255.
ip = re.compile(
    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
    re.IGNORECASE,
)

# IPv6 addresses (at most one "::", optional trailing dotted-quad part).
# NOTE(review): compiled with re.VERBOSE although the pattern contains no
# literal whitespace or "#"; the flag is kept unchanged to preserve behaviour.
ipv6 = re.compile(
    r"\s*(?!.*::.*::)(?:(?!:)|:(?=:))(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)){6}(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)[0-9a-f]{0,4}(?:(?<=::)|(?<!:)|(?<=:)(?<!::):)|(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)){3})\s*",
    re.VERBOSE | re.IGNORECASE | re.DOTALL,
)

# Four groups of four digits (optionally separated by "-" or " "), or a run
# of 15-16 digits, not followed by another digit.
credit_card = re.compile(r"((?:(?:\d{4}[- ]?){3}\d{4}|\d{15,16}))(?![\d])")

# Strings starting with "1" or "3" followed by 26-33 characters from the
# alphabet in the pattern, not embedded in a longer run of such characters.
btc_address = re.compile(
    r"(?<![a-km-zA-HJ-NP-Z0-9])[13][a-km-zA-HJ-NP-Z0-9]{26,33}(?![a-km-zA-HJ-NP-Z0-9])"
)

# House number, up to 20 word/space characters, then a street-type keyword.
street_address = re.compile(
    r"\d{1,4} [\w\s]{1,20}(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard|blvd)\W?(?=\s|$)",
    re.IGNORECASE,
)

# Five-digit codes with an optional four-digit suffix.
zip_code = re.compile(r"\b\d{5}(?:[-\s]\d{4})?\b")

# "PO Box <number>" with optional dots and space between "P" and "O".
po_box = re.compile(r"P\.? ?O\.? Box \d+", re.IGNORECASE)

# UK postcodes, including the special "GIR 0AA" form.
postcodes = re.compile(r"([gG][iI][rR] {0,}0[aA]{2})|((([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y]?[0-9][0-9]?)|(([a-pr-uwyzA-PR-UWYZ][0-9][a-hjkstuwA-HJKSTUW])|([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y][0-9][abehmnprv-yABEHMNPRV-Y]))) {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2})")
43
# UK landline numbers: London "020 7/8" numbers (first alternative, anchored
# at the start of the text) or other "0[1-8]xxx" area codes (second
# alternative, anchored at the end of the text).
# The first alternative previously contained the malformed class
# "[0-9{2}[ ]" (missing "]"), so well-formed London numbers never matched,
# and "[7,8]" also accepted a literal comma.
ukphones = re.compile(r"^\s*\(?(020[78]{1}\)?[ ]?[1-9]{1}[0-9]{2}[ ]?[0-9]{4})|(0[1-8]{1}[0-9]{3}\)?[ ]?[1-9]{1}[0-9]{2}[ ]?[0-9]{3})\s*$")
44
+
45
# Detector name -> compiled pattern. The names double as attribute names on
# PiiRegex instances, and the insertion order is the evaluation order.
regexes = dict(
    dates=date,
    times=time,
    phones=phone,
    phones_with_exts=phones_with_exts,
    emails=email,
    ips=ip,
    ipv6s=ipv6,
    credit_cards=credit_card,
    btc_addresses=btc_address,
    street_addresses=street_address,
    zip_codes=zip_code,
    po_boxes=po_box,
    postcodes=postcodes,
    ukphones=ukphones,
)
61
+
62
+
63
class regex:
    """Factory that binds a compiled pattern to an object exposing ``.text``.

    Calling an instance returns a function which runs ``findall`` either on
    an explicitly supplied text or, when none (or an empty one) is given, on
    the current ``text`` attribute of the bound object.
    """

    def __init__(self, obj, regex):
        # obj: any object with a ``text`` attribute, read lazily at call time.
        # regex: compiled pattern (anything providing ``findall``).
        self.obj = obj
        self.regex = regex

    def __call__(self, *args):
        """Return the matcher function; positional arguments are ignored."""

        def regex_method(text=None):
            # ``findall`` already returns a fresh list, so no copy is needed.
            # A falsy ``text`` (None or "") falls back to the bound object.
            return self.regex.findall(text or self.obj.text)

        return regex_method
74
+
75
+
76
class PiiRegex(object):
    """Run every pattern of the module-level ``regexes`` table over a text.

    For each key in ``regexes`` the instance gets an attribute of the same
    name. When the instance is created without text, those attributes are
    callables (``attr(text=None)``) returning the list of matches. When it is
    created with text, or after ``any_match`` has run, they hold the match
    lists themselves.
    """

    def __init__(self, text=""):
        self.text = text
        self._bind_matchers()
        if text:
            self._evaluate_matchers()

    def _bind_matchers(self):
        """Set one matcher callable per entry of ``regexes`` on the instance."""
        for name, pattern in regexes.items():
            setattr(self, name, regex(self, pattern)(self))

    def _evaluate_matchers(self):
        """Replace each bound matcher with the matches it finds in ``self.text``."""
        for name in regexes:
            setattr(self, name, getattr(self, name)())

    def any_match(self, text=""):
        """Return True when at least one pattern matches.

        If ``text`` is non-empty it replaces ``self.text`` first; otherwise
        the text currently stored on the instance is scanned. As a side
        effect every pattern attribute is refreshed with its list of matches.
        """
        if text:
            self.text = text

        # Matchers must be re-bound first: after a previous evaluation the
        # attributes hold result lists, not callables.
        self._bind_matchers()
        self._evaluate_matchers()

        return any(getattr(self, name) for name in regexes)
109
+
110
+
111
class Redactor:
    """Redact every text line of a PDF in which PII is detected.

    Detection is line based: each line of a page's extracted text is checked
    with ``PiiRegex``, and every occurrence of a flagged line on that page is
    covered by a redaction annotation.
    """

    @staticmethod
    def get_sensitive_data(lines):
        """Return the entries of ``lines`` for which ``PiiRegex.any_match`` is true."""
        return [line for line in lines if PiiRegex(line).any_match()]

    def __init__(self, file):
        # Whatever ``fitz.open`` accepts as its first argument.
        self.file = file

    def redaction(self, output_path='redacted.pdf'):
        """Redact the document and save the result.

        Args:
            output_path: where the redacted PDF is written. Defaults to
                ``'redacted.pdf'`` in the current working directory.
        """
        # The context manager guarantees the document is closed even when
        # redaction or saving fails.
        with fitz.open(self.file) as doc:
            for page in doc:
                lines = page.get_text("text").split('\n')
                for data in self.get_sensitive_data(lines):
                    # Never search for an empty needle.
                    if not data:
                        continue
                    for area in page.search_for(data):
                        annot = page.add_redact_annot(area.quad, text='REDACTED', fontname=None, fontsize=11, fill=(1, 1, 1), text_color=(0, 0, 0), cross_out=True)
                        annot.update()
                # Applying removes the covered content from this page.
                page.apply_redactions()
            doc.save(output_path)
        print("Successfully redacted")
153
+
154
+
155
+
156
def redact_pdf(file):
    """Redact PII from an uploaded PDF and return the path of the result.

    Args:
        file: the uploaded PDF, given either as a filesystem path or as a
            file-like object exposing the path through ``.name``.
            NOTE(review): which of the two Gradio passes depends on the
            installed version/component settings - confirm for the deployment.

    Returns:
        The path ``'redacted.pdf'`` that ``Redactor.redaction`` writes to.

    Raises:
        ValueError: if no file was supplied.
    """
    if file is None:
        raise ValueError("No PDF file was provided")

    # Normalise to a plain path so the PDF library always gets a filename.
    pdf_path = getattr(file, "name", file)

    redactor = Redactor(pdf_path)
    redactor.redaction()
    return 'redacted.pdf'
163
+
164
+
165
# Gradio wiring: a single PDF upload in, a single downloadable PDF out.
inputs = [gr.File(label="Upload PDF")]
outputs = [gr.File(label="Redacted PDF")]

# Text shown under the title in the web UI.
description = (
    'This tool detects and redacts the following types of PII information: '
    'dates, times, phone numbers, emails, IP addresses, credit card numbers, '
    'Bitcoin addresses, street addresses, zip codes, PO boxes, UK postcodes, '
    'and UK phone numbers.'
)

iface = gr.Interface(
    fn=redact_pdf,
    inputs=inputs,
    outputs=outputs,
    title="PDF Redactor",
    description=description,
)
# Start the web app as soon as the script runs.
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ PyMuPDF
3
+ transformers