"""Streamlit app that redacts personal information from uploaded files.

PDF uploads are redacted in place: for every sentence on every page, terms
flagged by a seq2seq masking model, by Presidio, and by a token-classification
model are searched on the page and blacked out. DOCX and TXT uploads are run
sentence-by-sentence through the seq2seq masking model and returned as text.
"""
import re

import fitz  # PyMuPDF
import nltk
import streamlit as st
from docx import Document
from nltk import word_tokenize
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerResult
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

nltk.download('punkt')


def sentence_tokenize(text):
    """Split *text* into a list of sentences using ``nltk.sent_tokenize``."""
    return nltk.sent_tokenize(text)


# Seq2seq model used by ``mask_generation`` and the token-classification
# pipeline used by ``words_red_bert``.
model_dir_large = 'edithram23/Redaction_Personal_info_v1'
tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
pipe1 = pipeline("token-classification", model="edithram23/new-bert-v2")

# Disabled alternative: a smaller seq2seq model for short inputs.
# model_dir_small = 'edithram23/Redaction'
# tokenizer_small = AutoTokenizer.from_pretrained(model_dir_small)
# model_small = AutoModelForSeq2SeqLM.from_pretrained(model_dir_small)
# def small(text, model=model_small, tokenizer=tokenizer_small):
#     inputs = ["Mask Generation: " + text.lower() + '.']
#     inputs = tokenizer(inputs, max_length=256, truncation=True, return_tensors="pt")
#     output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
#     decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
#     predicted_title = decoded_output.strip()
#     pattern = r'\[.*?\]'
#     redacted_text = re.sub(pattern, '[redacted]', predicted_title)
#     return redacted_text

# Initialize the analyzer engine
analyzer = AnalyzerEngine()

# Define a custom address recognizer using a regex pattern
address_pattern = Pattern(
    name="address",
    regex=r"\d+\s\w+\s(?:street|st|road|rd|avenue|ave|lane|ln|drive|dr|blvd|boulevard)\s*\w*",
    score=0.5,
)
address_recognizer = PatternRecognizer(supported_entity="ADDRESS", patterns=[address_pattern])

# Add the custom address recognizer to the analyzer
analyzer.registry.add_recognizer(address_recognizer)

# Analyzer entity type -> key under which ``extract_entities`` reports it.
# Entity types not listed here are ignored.
_ENTITY_TYPE_TO_KEY = {
    "PERSON": "NAME",
    "PHONE_NUMBER": "PHONE_NUMBER",
    "EMAIL_ADDRESS": "EMAIL",
    "ADDRESS": "ADDRESS",
    "LOCATION": "LOCATION",
    "IN_AADHAAR": "IN_AADHAAR",
}


def combine_words(entities):
    """Merge adjacent token entities into single entities.

    Args:
        entities: Sequence of dicts carrying at least ``'word'``, ``'start'``
            and ``'end'`` (presumably the output of the token-classification
            pipeline - confirm against the model's output schema).

    Returns:
        A new list of dict copies. An entity whose ``start`` equals the
        previous ``end`` is appended without a space; one whose ``start`` is
        ``end + 1`` is appended with a single space. ``'##'`` markers are
        stripped from every word. All other keys (e.g. ``'entity'``) keep the
        value of the first entity in each merged run.
    """
    combined_entities = []
    current_entity = None
    for entity in entities:
        word = entity['word'].replace('##', '')
        if current_entity is not None and current_entity['end'] == entity['start']:
            # Directly adjacent: continuation of the same word.
            current_entity['word'] += word
            current_entity['end'] = entity['end']
        elif current_entity is not None and current_entity['end'] + 1 == entity['start']:
            # One character apart: treat the gap as a space.
            current_entity['word'] += ' ' + word
            current_entity['end'] = entity['end']
        else:
            # Not adjacent (or first entity): flush and start a new run.
            if current_entity is not None:
                combined_entities.append(current_entity)
            current_entity = entity.copy()
            current_entity['word'] = word
    if current_entity is not None:
        combined_entities.append(current_entity)
    return combined_entities


def words_red_bert(text):
    """Return the words flagged by the token-classification pipeline.

    *text* is processed sentence by sentence. A merged entity is kept when its
    ``'entity'`` label is not ``'none'``, its word is longer than one
    character, and the word is not ``', '``.
    """
    final = []
    for sentence in sentence_tokenize(text):
        for entity in combine_words(pipe1(sentence)):
            word = entity['word']
            if entity['entity'] != 'none' and len(word) > 1 and word != ', ':
                final.append(word)
    return final


def extract_entities(text):
    """Run the Presidio analyzer on *text* and collect the recognised spans.

    Returns:
        A tuple ``(entities, output)``: ``entities`` maps each of ``NAME``,
        ``PHONE_NUMBER``, ``EMAIL``, ``ADDRESS``, ``LOCATION`` and
        ``IN_AADHAAR`` to the list of matched substrings, and ``output`` is
        the flat list of all those substrings in analyzer result order.
    """
    entities = {key: [] for key in _ENTITY_TYPE_TO_KEY.values()}
    output = []
    # Analyze the text for PII
    for result in analyzer.analyze(text=text, language='en'):
        key = _ENTITY_TYPE_TO_KEY.get(result.entity_type)
        if key is None:
            continue
        span = text[result.start:result.end]
        # Fix: IN_AADHAAR hits used to be appended to the non-existent
        # 'IN_PAN' key, which raised KeyError.
        entities[key].append(span)
        output.append(span)
    return entities, output


def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
    """Mask personal information in *text* with the seq2seq model.

    The input is lower-cased and prefixed with ``"Mask Generation: "``; every
    ``[...]`` span in the decoded output is replaced by ``[redacted]``.

    Returns:
        The decoded, stripped model output with bracketed spans replaced.
    """
    if len(text) < 90:
        text = text + '.'
        # return small(text)
    inputs = ["Mask Generation: " + text.lower() + '.']
    inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
    # NOTE(review): max_length is given the *character* count of the input,
    # and do_sample=True presumably makes the output vary between runs -
    # confirm both are intended.
    output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    predicted_title = decoded_output.strip()
    pattern = r'\[.*?\]'
    return re.sub(pattern, '[redacted]', predicted_title)


def redact_text(page, text):
    """Black out every occurrence of *text* found on the PDF *page*."""
    for inst in page.search_for(text):
        page.add_redact_annot(inst, fill=(0, 0, 0))
    page.apply_redactions()


def read_pdf(file):
    """Open an uploaded PDF; return ``(all_page_text, pdf_document)``."""
    pdf_document = fitz.open(stream=file.read(), filetype="pdf")
    text = "".join(page.get_text() for page in pdf_document)
    return text, pdf_document


def read_docx(file):
    """Return the paragraphs of a DOCX file joined by newlines."""
    doc = Document(file)
    return "\n".join(para.text for para in doc.paragraphs)


def read_txt(file):
    """Return the contents of an uploaded file decoded as UTF-8."""
    return file.read().decode("utf-8")


def process_file(file):
    """Read an uploaded file according to its MIME type.

    Returns:
        ``(text, pdf_document)``; ``pdf_document`` is ``None`` for anything
        that is not a PDF. Unknown types yield the text
        ``"Unsupported file type."``.
    """
    if file.type == "application/pdf":
        return read_pdf(file)
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return read_docx(file), None
    elif file.type == "text/plain":
        return read_txt(file), None
    else:
        return "Unsupported file type.", None


def _collect_redaction_terms(sent):
    """Return the strings to black out for one sentence, longest first."""
    masked = mask_generation(sent)
    sent_words = word_tokenize(sent.lower())
    masked_words = word_tokenize(masked.lower())
    # Words present in the sentence but missing from the masked output are
    # the ones the seq2seq model replaced.
    t5_words = list(set(sent_words).difference(masked_words))
    _, words_out = extract_entities(sent)
    bert_words = words_red_bert(sent)
    words_out += t5_words
    candidates = []
    for word in words_out:
        # Spans may cross line breaks; each line is searched separately.
        candidates += word.split('\n')
    candidates += bert_words
    terms = [term for term in candidates if len(term) > 3]
    # Longest first, so longer phrases are searched for before shorter ones.
    return sorted(terms, key=len, reverse=True)


st.title("Redaction")
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])

if uploaded_file is not None:
    file_contents, pdf_document = process_file(uploaded_file)
    if pdf_document:
        for pg in pdf_document:
            for sent in sentence_tokenize(pg.get_text()):
                for term in _collect_redaction_terms(sent):
                    redact_text(pg, term)
        # Serialise in memory instead of writing a fixed-name file to the
        # working directory, which every session would share and overwrite.
        st.download_button(
            label="Download Processed PDF",
            data=pdf_document.tobytes(),
            file_name="processed_file.pdf",
            mime="application/pdf",
        )
    else:
        processed_text = "".join(
            mask_generation(sentence) + '\n'
            for sentence in sentence_tokenize(file_contents)
        )
        st.text_area("OUTPUT", processed_text, height=400)
        st.download_button(
            label="Download Processed File",
            data=processed_text,
            file_name="processed_file.txt",
            mime="text/plain",
        )