Redaction_PDF_advanced

Sleeping

App Files Files Community

edithram23 committed on Jun 24, 2024

Commit

7456815

verified ·

1 Parent(s): 51d9845

Upload app.py

Browse files

Files changed (1) hide show

app.py +76 -0

app.py ADDED Viewed

	@@ -0,0 +1,76 @@

+from transformers import AutoTokenizer
+from transformers import AutoModelForSeq2SeqLM
+import streamlit as st
+import fitz  # PyMuPDF
+from docx import Document
+import re
+import nltk
+nltk.download('punkt')
+def sentence_tokenize(text):
+    sentences = nltk.sent_tokenize(text)
+    return sentences
+model_dir_large = 'edithram23/Redaction_Personal_info_v1'
+tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
+model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
+def mask_generation(text,model=model_large,tokenizer=tokenizer_large):
+    inputs = ["Mask Generation: " + text+'.']
+    inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
+    output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
+    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
+    predicted_title = decoded_output.strip()
+    pattern = r'\[.*?\]'
+    # Replace all occurrences of the pattern with [redacted]
+    redacted_text = re.sub(pattern, '[redacted]', predicted_title)
+    return redacted_text
+def read_pdf(file):
+    pdf_document = fitz.open(stream=file.read(), filetype="pdf")
+    text = ""
+    for page_num in range(len(pdf_document)):
+        page = pdf_document.load_page(page_num)
+        text += page.get_text()
+    return text
+def read_docx(file):
+    doc = Document(file)
+    text = "\n".join([para.text for para in doc.paragraphs])
+    return text
+def read_txt(file):
+    text = file.read().decode("utf-8")
+    return text
+def process_file(file):
+    if file.type == "application/pdf":
+        return read_pdf(file)
+    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+        return read_docx(file)
+    elif file.type == "text/plain":
+        return read_txt(file)
+    else:
+        return "Unsupported file type."
+st.title("File Reader")
+uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])
+if uploaded_file is not None:
+    file_contents = process_file(uploaded_file)
+    token = sentence_tokenize(file_contents)
+    final=''
+    for i in range(0, len(token)):
+        final+=mask_generation(token[i])+'\n'
+    processed_text = final
+    st.text_area("File Contents", processed_text, height=400)
+    st.download_button(
+        label="Download Processed File",
+        data=processed_text,
+        file_name="processed_file.txt",
+        mime="text/plain",
+    )