edithram23 committed on
Commit 67ff28f
Parent: ca0d553

Update app.py

Files changed (1)
  app.py +70 -82
app.py CHANGED
@@ -28,7 +28,55 @@ model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
  # pattern = r'\[.*?\]'
  # redacted_text = re.sub(pattern, '[redacted]', predicted_title)
  # return redacted_text
-
+ from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult, Pattern
+
+ # Initialize the analyzer engine
+ analyzer = AnalyzerEngine()
+
+ # Define a custom address recognizer using a regex pattern
+ address_pattern = Pattern(name="address", regex=r"\d+\s\w+\s(?:street|st|road|rd|avenue|ave|lane|ln|drive|dr|blvd|boulevard)\s*\w*", score=0.5)
+ address_recognizer = PatternRecognizer(supported_entity="ADDRESS", patterns=[address_pattern])
+
+ # Add the custom address recognizer to the analyzer
+ analyzer.registry.add_recognizer(address_recognizer)
+
+ # Extract detected PII entities and the matched strings from a piece of text
+ def extract_entities(text):
+     entities = {
+         "NAME": [],
+         "PHONE_NUMBER": [],
+         "EMAIL": [],
+         "ADDRESS": [],
+         "LOCATION": [],
+         "IN_AADHAAR": [],
+     }
+     output = []
+
+     # Analyze the text for PII
+     results = analyzer.analyze(text=text, language='en')
+
+     for result in results:
+         if result.entity_type == "PERSON":
+             entities["NAME"].append(text[result.start:result.end])
+             output += [text[result.start:result.end]]
+         elif result.entity_type == "PHONE_NUMBER":
+             entities["PHONE_NUMBER"].append(text[result.start:result.end])
+             output += [text[result.start:result.end]]
+         elif result.entity_type == "EMAIL_ADDRESS":
+             entities["EMAIL"].append(text[result.start:result.end])
+             output += [text[result.start:result.end]]
+         elif result.entity_type == "ADDRESS":
+             entities["ADDRESS"].append(text[result.start:result.end])
+             output += [text[result.start:result.end]]
+         elif result.entity_type == 'LOCATION':
+             entities['LOCATION'].append(text[result.start:result.end])
+             output += [text[result.start:result.end]]
+         elif result.entity_type == 'IN_AADHAAR':
+             entities['IN_AADHAAR'].append(text[result.start:result.end])
+             output += [text[result.start:result.end]]
+
+     return entities, output
+
  def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
      if len(text) < 90:
          text = text + '.'
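For context, a minimal standalone sketch of the Presidio flow this hunk introduces. It assumes presidio-analyzer (and the spaCy English model it loads by default) is installed; the sample text is illustrative, and the entity names come from Presidio's built-in recognizers:

    from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer

    analyzer = AnalyzerEngine()

    # Custom ADDRESS recognizer of the same shape as the one in the diff, shortened here
    address_pattern = Pattern(name="address", regex=r"\d+\s\w+\s(?:street|st|road|rd)\b", score=0.5)
    analyzer.registry.add_recognizer(
        PatternRecognizer(supported_entity="ADDRESS", patterns=[address_pattern])
    )

    # Each result carries entity_type plus start/end offsets into the input text
    text = "Reach John Doe at 212-555-0123 or john@example.com."
    for r in analyzer.analyze(text=text, language="en"):
        print(r.entity_type, text[r.start:r.end], round(r.score, 2))

Slicing text[r.start:r.end] is exactly how extract_entities above recovers the matched strings before routing them into the entities dict and the flat output list.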
@@ -42,55 +90,6 @@ def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
      redacted_text = re.sub(pattern, '[redacted]', predicted_title)
      return redacted_text

- def find_surrounding_words(text, target="[redacted]"):
-     pattern = re.compile(r'([A-Za-z0-9_@#\$%\^&*\(\)\[\]\{\}\.\,]+)?\s*' + re.escape(target) + r'\s*([A-Za-z0-9_@#\$%\^&*\(\)\[\]\{\}\.\,]+)?')
-     matches = pattern.finditer(text)
-     results = []
-     for match in matches:
-         before, after = match.group(1), match.group(2)
-
-         if before:
-             before_parts = before.split(',')
-             before_parts = [item for item in before_parts if item.strip()]
-             if len(before_parts) > 1:
-                 before_word = before_parts[0].strip()
-                 before_index = match.start(1)
-             else:
-                 before_word = before_parts[0]
-                 before_index = match.start(1)
-         else:
-             before_word = None
-             before_index = None
-
-         if after:
-             after_parts = after.split(',')
-             after_parts = [item for item in after_parts if item.strip()]
-             if len(after_parts) > 1:
-                 after_word = after_parts[0].strip()
-                 after_index = match.start(2)
-             else:
-                 after_word = after_parts[0]
-                 after_index = match.start(2)
-         else:
-             after_word = None
-             after_index = None
-
-         if match.start() == 0:
-             before_word = None
-             before_index = None
-
-         if match.end() == len(text):
-             after_word = None
-             after_index = None
-
-         results.append({
-             "before_word": before_word,
-             "after_word": after_word,
-             "before_index": before_index,
-             "after_index": after_index
-         })
-     return results
-
  def redact_text(page, text):
      text_instances = page.search_for(text)
      for inst in text_instances:
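The redact_text helper shown in context above wraps PyMuPDF's redaction annotations. A hedged standalone sketch of that primitive, with input.pdf as a placeholder path:

    import fitz  # PyMuPDF

    doc = fitz.open("input.pdf")
    for page in doc:
        # search_for returns one rectangle per occurrence of the needle on the page
        for rect in page.search_for("John Doe"):
            page.add_redact_annot(rect, fill=(0, 0, 0))  # cover the match with a black box
        page.apply_redactions()  # burn the annotations in, removing the underlying text
    doc.save("redacted.pdf")

apply_redactions() physically removes the text beneath each annotation, so the output resists copy-paste extraction, unlike a plain black rectangle drawn over the words.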
@@ -131,38 +130,27 @@ if uploaded_file is not None:
      file_contents, pdf_document = process_file(uploaded_file)
      if pdf_document:
          redacted_text = []
-         for page in pdf_document:
-             pg = page.get_text()
-             pg_lower = pg.lower()
-             token = sentence_tokenize(pg)
-             final = ''
-             for t in token:
-                 t_lower = t.lower()
-                 final = mask_generation(t)
-                 words = find_surrounding_words(final)
-                 for i in range(len(words)):
-                     if words[i]['after_index'] is None:
-                         if words[i]['before_word'] in t_lower:
-                             fi = t_lower.index(words[i]['before_word'])
-                             fi = fi + len(words[i]['before_word'])
-                             li = len(t)
-                             redacted_text.append(t[fi:li])
-                     elif words[i]['before_index'] is None:
-                         if words[i]['after_word'] in t_lower:
-                             fi = 0
-                             li = t_lower.index(words[i]['after_word'])
-                             redacted_text.append(t[fi:li])
-                     else:
-                         if words[i]['after_word'] in t_lower and words[i]['before_word'] in t_lower:
-                             before_word = words[i]['before_word']
-                             after_word = words[i]['after_word']
-                             fi = t_lower.index(before_word)
-                             fi = fi + len(before_word)
-                             li = t_lower.index(after_word)
-                             redacted_text.append(t[fi:li])
-         for page in pdf_document:
-             for i in redacted_text:
-                 redact_text(page, i)
+         for pg in pdf_document:
+             text = pg.get_text('text')
+             sentences = sentence_tokenize(text)
+             for sent in sentences:
+                 entities, words_out = extract_entities(sent)
+                 avai_red = pg.search_for(sent)
+                 new = []
+                 # search_for matches per physical line, so split strings that wrap
+                 for w in words_out:
+                     new += w.split('\n')
+                 words_out = [i for i in new if len(i) > 2]  # drop fragments too short to search safely
+                 for rect in avai_red:
+                     b = pg.get_text("text", clip=rect)
+                     # result = [item for item in output if item in b] # Get elements of 'a' that are in 'b'
+                     for j in words_out:
+                         new_n = pg.search_for(j, clip=rect)
+                         for found in new_n:
+                             pg.add_redact_annot(found, fill=(0, 0, 0))
+                 # burn in the redactions collected for this sentence
+                 pg.apply_redactions()
+
          output_pdf = "output_redacted.pdf"
          pdf_document.save(output_pdf)
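Note how the new loop passes clip= to search_for: each extracted string is searched only inside the rectangles of the sentence it was detected in, so a short token such as a first name is not blacked out everywhere else it happens to occur on the page. A sketch of that narrowing, with illustrative names and a placeholder path:

    import fitz

    doc = fitz.open("input.pdf")
    page = doc[0]
    sentence = "John Doe lives at 12 Baker Street."
    for sent_rect in page.search_for(sentence):                   # where the sentence sits on the page
        for hit in page.search_for("John Doe", clip=sent_rect):   # search only inside that area
            page.add_redact_annot(hit, fill=(0, 0, 0))
    page.apply_redactions()
    doc.save("redacted.pdf")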
 
 
156