Spaces:
Running
Running
File size: 5,392 Bytes
8850a9d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import streamlit as st
from Final_file import FlairRecognizer
import os
import PyPDF2
import docx
# from io import BytesIO
from fpdf import FPDF
import io
from docx import Document
# Cached wrapper around the NER tagging call.
@st.cache_resource
def cached_predict_ner_tags(text):
    """Return ``FlairRecognizer.predict_ner_tags(text)``.

    The function is wrapped in ``st.cache_resource``, so Streamlit reuses
    the stored result when it is called again with the same ``text``.

    NOTE(review): the decorator is used without arguments, so no entry
    limit or TTL is configured here -- confirm that is acceptable for
    arbitrary user-supplied text.
    """
    ner_tags = FlairRecognizer.predict_ner_tags(text)
    return ner_tags
# Cached wrapper around the text analysis (masking) call.
@st.cache_resource
def cached_analyze_text(text):
    """Return ``FlairRecognizer.analyze_text(text)``.

    The function is wrapped in ``st.cache_resource``, so Streamlit reuses
    the stored result when it is called again with the same ``text``.
    Callers in this file display the return value as the masked text.
    """
    analysis_result = FlairRecognizer.analyze_text(text)
    return analysis_result
def download_masked_file(masked_text, file_extension):
    """Render a Streamlit download button that serves ``masked_text``.

    Args:
        masked_text: The text to offer for download.
        file_extension: Extension (without the dot) used to build the
            suggested file name ``masked_output.<file_extension>``.

    The text is handed to ``st.download_button`` directly as its ``data``.
    The previous implementation wrote the text to a temporary file and then
    passed the *path string* as ``data``, so the downloaded file contained
    the file name instead of the masked text. No file on disk is needed.
    """
    download_name = f"masked_output.{file_extension}"
    st.download_button(
        "Download Masked File",
        masked_text,
        file_name=download_name,
    )
def extract_text_from_pdf(file_contents):
    """Return the concatenated text of every page in a PDF.

    Args:
        file_contents: Whatever ``PyPDF2.PdfReader`` accepts as its source
            (callers in this file pass the uploaded file object).

    Returns:
        The page texts joined with no separator, or, if anything raises
        while reading, the string ``"Error occurred: <message>"``.

    NOTE(review): failures are reported through the return value, so a
    caller cannot distinguish an error message from extracted text without
    inspecting the string.
    """
    try:
        reader = PyPDF2.PdfReader(file_contents)
        return ''.join(page.extract_text() for page in reader.pages)
    except Exception as exc:
        return f"Error occurred: {str(exc)}"
def create_pdf(text_content):
    """Build an ``FPDF`` document containing ``text_content``.

    Args:
        text_content: The text written into the document.

    Returns:
        The ``FPDF`` object; the caller is responsible for calling
        ``output`` on it.
    """
    document = FPDF()
    document.add_page()
    # Register the TrueType font from "DejaVuSans.ttf" before selecting it.
    # NOTE(review): the font path is relative -- confirm the file ships
    # alongside the app's working directory.
    document.add_font("DejaVuSans", "", "DejaVuSans.ttf", uni=True)
    document.set_font("DejaVuSans", size=12)
    document.multi_cell(0, 10, txt=text_content)
    return document
def create_word_file(text_content):
    """Build a Word document holding ``text_content`` as one paragraph.

    Args:
        text_content: The text added to the document via ``add_paragraph``.

    Returns:
        An ``io.BytesIO`` containing the saved document, rewound to
        position 0 so it can be read from the start.
    """
    buffer = io.BytesIO()
    word_document = Document()
    word_document.add_paragraph(text_content)
    # Serialise into memory rather than to a file on disk.
    word_document.save(buffer)
    buffer.seek(0)
    return buffer
# MIME type Streamlit reports for uploaded .docx files.
_DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'


def _mask_and_show(text):
    """Run tagging and masking on ``text``, display the result, return it."""
    with st.spinner('Wait for it... the model is loading'):
        # The tagging result is not used here; the call is kept from the
        # original flow (presumably it warms the model -- confirm).
        cached_predict_ner_tags(text)
        masked_text = cached_analyze_text(text)
    st.text_area("Masked text:", value=masked_text, height=200)
    return masked_text


def _render_pdf_bytes(masked_text):
    """Render ``masked_text`` to a PDF and return the file's bytes.

    The PDF is written inside a private temporary directory that is removed
    on exit, so concurrent sessions never share (or leave behind) an output
    file, and the file handle is closed before returning.
    """
    import tempfile  # local import: only this helper needs it

    pdf = create_pdf(masked_text)
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "masked_output.pdf")
        pdf.output(pdf_path)
        with open(pdf_path, "rb") as pdf_file:
            return pdf_file.read()


def _extract_text_from_docx(file_contents):
    """Return the paragraphs of a .docx (given as bytes), one per line."""
    word_doc = docx.Document(io.BytesIO(file_contents))
    # Join with newlines so the last word of one paragraph is not fused
    # with the first word of the next before entity recognition runs.
    return "\n".join(paragraph.text for paragraph in word_doc.paragraphs)


def main():
    """Streamlit entry point for the PII masking app.

    Offers two input modes chosen in the sidebar:

    * 'Text Input' -- mask text typed into a text area.
    * 'File Upload' -- mask the text of an uploaded .txt, .pdf or .docx
      file and offer the masked result for download in the same format.

    In both modes the work is triggered by the 'Analyze' button.
    """
    st.title('PII Masking App')
    st.sidebar.header('Upload Options')
    upload_option = st.sidebar.radio("Choose upload option:", ('Text Input', 'File Upload'))

    if upload_option == 'Text Input':
        input_text = st.text_area("Enter text here:")
        if st.button('Analyze'):
            _mask_and_show(input_text)

    elif upload_option == 'File Upload':
        uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
        if uploaded_file is None:
            return
        file_contents = uploaded_file.read()

        # Process PDF file
        if uploaded_file.type == 'application/pdf':
            # Parse from a fresh in-memory stream: the upload object has
            # already been consumed by read() above.
            extracted_text = extract_text_from_pdf(io.BytesIO(file_contents))
            if st.button('Analyze'):
                masked_text = _mask_and_show(extracted_text)
                if extracted_text:
                    st.download_button(
                        label="Download",
                        data=_render_pdf_bytes(masked_text),
                        file_name="masked_output.pdf",
                        mime="application/pdf",
                    )
                else:
                    st.warning("Please enter some text to download as PDF.")

        # Process Word document
        elif uploaded_file.type == _DOCX_MIME:
            text = _extract_text_from_docx(file_contents)
            if st.button('Analyze'):
                masked_text = _mask_and_show(text)
                st.download_button(
                    label="Download",
                    data=create_word_file(masked_text),
                    file_name="masked_text.docx",
                    mime=_DOCX_MIME,
                )

        # Anything else accepted by the uploader is treated as plain text
        else:
            if st.button('Analyze'):
                try:
                    text = file_contents.decode()
                except UnicodeDecodeError:
                    # Report undecodable uploads instead of crashing the app.
                    st.error("The uploaded file is not valid UTF-8 text.")
                    return
                masked_text = _mask_and_show(text)
                st.download_button(label="Download", data=masked_text, file_name="masked_text.txt")
# Launch the app when the file is executed as a script.
if __name__ == "__main__":
    main()