"""Streamlit proof-of-concept that compares an original and an edited document.

Two uploads (both PDF or both DOCX) are compared on extracted text and on
document metadata; the result is shown as a similarity score plus an HTML
side-by-side diff.
"""

import os
import re
from difflib import HtmlDiff, SequenceMatcher

import docx
import fitz  # PyMuPDF
import streamlit as st


# Functions to extract text and metadata
def _rewind(file):
    """Seek *file* back to its start when it is a seekable file-like object.

    Each upload is read more than once (text, then metadata), so every reader
    must start from offset 0 regardless of what a previous reader consumed.
    Objects without a ``seek`` method (e.g. path strings) are left untouched.
    """
    seek = getattr(file, "seek", None)
    if callable(seek):
        seek(0)


def _open_pdf(file):
    """Open *file* with PyMuPDF, accepting a path, raw bytes or a file-like.

    Paths are passed through unchanged. In-memory data is handed over via
    ``stream=`` with an explicit ``filetype`` because it has no file name on
    disk that PyMuPDF could open or sniff the format from.
    """
    if isinstance(file, (str, os.PathLike)):
        return fitz.open(file)
    if isinstance(file, (bytes, bytearray)):
        return fitz.open(stream=file, filetype="pdf")
    _rewind(file)
    return fitz.open(stream=file.read(), filetype="pdf")


def extract_text_pdf(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: Path, bytes, or binary file-like object holding the PDF.

    Returns:
        The text of all pages, in page order, as one string.
    """
    with _open_pdf(file) as doc:
        return "".join(page.get_text() for page in doc)


def extract_text_word(file):
    """Return the text of a DOCX document, one paragraph per line.

    Args:
        file: Path or binary file-like object holding the DOCX.

    Returns:
        The paragraph texts joined with newlines.
    """
    _rewind(file)
    doc = docx.Document(file)
    return "\n".join(para.text for para in doc.paragraphs)


def extract_metadata_pdf(file):
    """Return the metadata PyMuPDF reports for a PDF.

    Args:
        file: Path, bytes, or binary file-like object holding the PDF.

    Returns:
        The document's ``metadata`` attribute as provided by PyMuPDF.
    """
    with _open_pdf(file) as doc:
        return doc.metadata


def extract_metadata_word(file):
    """Return author and timestamps from a DOCX document's core properties.

    Args:
        file: Path or binary file-like object holding the DOCX.

    Returns:
        A dict with the keys ``"author"``, ``"created"`` and ``"modified"``.
    """
    _rewind(file)
    core_props = docx.Document(file).core_properties
    return {
        "author": core_props.author,
        "created": core_props.created,
        "modified": core_props.modified,
    }


# Function to compare text using difflib and return highlighted HTML differences
def compare_texts(text1, text2):
    """Build a standalone HTML page highlighting line differences.

    Args:
        text1: The original text.
        text2: The edited text.

    Returns:
        A complete HTML document (string) with a side-by-side diff that shows
        only changed regions plus two lines of surrounding context.
    """
    differ = HtmlDiff()
    return differ.make_file(
        text1.splitlines(), text2.splitlines(), context=True, numlines=2
    )


# Function to calculate similarity score
def calculate_similarity(text1, text2):
    """Return the ``SequenceMatcher`` similarity ratio of two strings.

    Args:
        text1: The original text.
        text2: The edited text.

    Returns:
        A float in ``[0.0, 1.0]``; ``1.0`` means the strings are identical.
    """
    return SequenceMatcher(None, text1, text2).ratio()


# Streamlit App Interface
st.title("Document Edit Detection POC")
st.write("Upload both the original and edited documents below:")

# File upload
original_file = st.file_uploader("Upload Original Document", type=["pdf", "docx"])
edited_file = st.file_uploader("Upload Edited Document", type=["pdf", "docx"])

# Process if both files are uploaded
if original_file and edited_file:
    # Identify file types. Lower-cased so "report.PDF" and "report.pdf" are
    # recognised as the same type and routed to the PDF branch.
    original_ext = os.path.splitext(original_file.name)[1].lower()
    edited_ext = os.path.splitext(edited_file.name)[1].lower()

    # Check if both files are of the same type
    if original_ext != edited_ext:
        st.error("Both documents must be of the same type (PDF or DOCX).")
    else:
        # Extract text and metadata
        if original_ext == ".pdf":
            original_text = extract_text_pdf(original_file)
            edited_text = extract_text_pdf(edited_file)
            original_metadata = extract_metadata_pdf(original_file)
            edited_metadata = extract_metadata_pdf(edited_file)
        else:
            original_text = extract_text_word(original_file)
            edited_text = extract_text_word(edited_file)
            original_metadata = extract_metadata_word(original_file)
            edited_metadata = extract_metadata_word(edited_file)

        # Display Metadata
        st.subheader("Metadata Comparison")
        metadata_match = original_metadata == edited_metadata
        st.write("Metadata Match:", metadata_match)
        st.write("Original Document Metadata:")
        st.write(original_metadata)
        st.write("Edited Document Metadata:")
        st.write(edited_metadata)

        # Compare text
        st.subheader("Text Comparison")
        text_diff_html = compare_texts(original_text, edited_text)
        similarity_score = calculate_similarity(original_text, edited_text)
        # Computed once; shown both here and in the report summary below.
        similarity_percent = round(similarity_score * 100, 2)
        st.write("Similarity Score:", similarity_percent, "%")
        text_match = similarity_score == 1.0
        st.write("Text Match:", text_match)

        # Display highlighted text differences
        st.write("Differences:")
        st.components.v1.html(text_diff_html, height=400, scrolling=True)

        # Report Generation
        st.subheader("Report Summary")
        st.write("Metadata Match:", metadata_match)
        st.write("Text Match:", text_match)
        st.write("Similarity Score:", similarity_percent, "%")
else:
    st.info("Please upload both the original and edited documents to proceed.")