"""Streamlit proof of concept that compares an original and an edited document.

The app accepts two uploads (both PDF or both DOCX), extracts their text and
metadata, and reports a metadata equality check, a similarity score, and an
HTML side-by-side diff of the text.
"""

import os
from difflib import HtmlDiff, SequenceMatcher

import docx
import fitz  # PyMuPDF
import streamlit as st

# Directory where uploads are written so the parsers can open them by path.
UPLOAD_DIR = "uploaded_files"
os.makedirs(UPLOAD_DIR, exist_ok=True)


def save_uploaded_file(uploaded_file, prefix=""):
    """Write an uploaded file into ``UPLOAD_DIR`` and return its path.

    Args:
        uploaded_file: Object exposing ``.name`` and ``.getbuffer()`` (as the
            value returned by ``st.file_uploader`` is used here).
        prefix: Optional string prepended to the stored file name. Callers
            saving several uploads should pass distinct prefixes so that two
            uploads sharing a name do not overwrite each other.

    Returns:
        The path of the saved file.
    """
    # Keep only the final path component so a crafted name cannot place the
    # file outside UPLOAD_DIR.
    safe_name = os.path.basename(uploaded_file.name)
    file_path = os.path.join(UPLOAD_DIR, f"{prefix}{safe_name}")
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path


def extract_text_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*."""
    # The context manager closes the document handle once the text is read.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)


def extract_text_word(file_path):
    """Return the paragraphs of the DOCX at *file_path*, joined by newlines."""
    doc = docx.Document(file_path)
    return "\n".join(para.text for para in doc.paragraphs)


def extract_metadata_pdf(file_path):
    """Return the metadata of the PDF at *file_path* as exposed by PyMuPDF."""
    with fitz.open(file_path) as doc:
        return doc.metadata


def extract_metadata_word(file_path):
    """Return author, created and modified core properties of a DOCX file."""
    core_props = docx.Document(file_path).core_properties
    return {
        "author": core_props.author,
        "created": core_props.created,
        "modified": core_props.modified,
    }


def compare_texts(text1, text2):
    """Return a complete HTML page highlighting line differences.

    The texts are compared line by line; only changed regions with two lines
    of surrounding context are included.
    """
    differ = HtmlDiff()
    return differ.make_file(
        text1.splitlines(), text2.splitlines(), context=True, numlines=2
    )


def calculate_similarity(text1, text2):
    """Return a character-level similarity ratio in ``[0.0, 1.0]``.

    Identical inputs return ``1.0`` immediately, which avoids running the
    matcher at all in the common "nothing changed" case.
    """
    if text1 == text2:
        return 1.0
    # NOTE: SequenceMatcher's default autojunk heuristic is active for
    # sequences of 200+ items, so scores for long texts are approximate.
    return SequenceMatcher(None, text1, text2).ratio()


# Streamlit App Interface
st.title("Document Edit Detection POC")
st.write("Upload both the original and edited documents below:")

# File upload
original_file = st.file_uploader("Upload Original Document", type=["pdf", "docx"])
edited_file = st.file_uploader("Upload Edited Document", type=["pdf", "docx"])

# Process if both files are uploaded
if original_file and edited_file:
    # Distinct prefixes keep the two uploads apart even when they share a
    # file name; otherwise the second save would overwrite the first and the
    # app would compare the edited document with itself.
    original_file_path = save_uploaded_file(original_file, prefix="original_")
    edited_file_path = save_uploaded_file(edited_file, prefix="edited_")

    # Lower-case the extensions so "report.PDF" and "report.pdf" are treated
    # as the same type and routed to the PDF parser.
    original_ext = os.path.splitext(original_file.name)[1].lower()
    edited_ext = os.path.splitext(edited_file.name)[1].lower()

    # Check if both files are of the same type
    if original_ext != edited_ext:
        st.error("Both documents must be of the same type (PDF or DOCX).")
    else:
        # Extract text and metadata
        if original_ext == ".pdf":
            original_text = extract_text_pdf(original_file_path)
            edited_text = extract_text_pdf(edited_file_path)
            original_metadata = extract_metadata_pdf(original_file_path)
            edited_metadata = extract_metadata_pdf(edited_file_path)
        else:
            original_text = extract_text_word(original_file_path)
            edited_text = extract_text_word(edited_file_path)
            original_metadata = extract_metadata_word(original_file_path)
            edited_metadata = extract_metadata_word(edited_file_path)

        # Display Metadata
        st.subheader("Metadata Comparison")
        metadata_match = original_metadata == edited_metadata
        st.write("Metadata Match:", metadata_match)
        st.write("Original Document Metadata:")
        st.write(original_metadata)
        st.write("Edited Document Metadata:")
        st.write(edited_metadata)

        # Compare text
        st.subheader("Text Comparison")
        text_diff_html = compare_texts(original_text, edited_text)
        similarity_score = calculate_similarity(original_text, edited_text)
        similarity_pct = round(similarity_score * 100, 2)
        st.write("Similarity Score:", similarity_pct, "%")
        # Direct string equality states the intent without relying on an
        # exact floating-point comparison of the ratio.
        text_match = original_text == edited_text
        st.write("Text Match:", text_match)

        # Display highlighted text differences
        st.write("Differences:")
        st.components.v1.html(text_diff_html, height=400, scrolling=True)

        # Report Generation
        st.subheader("Report Summary")
        st.write("Metadata Match:", metadata_match)
        st.write("Text Match:", text_match)
        st.write("Similarity Score:", similarity_pct, "%")
else:
    st.info("Please upload both the original and edited documents to proceed.")