# Spaces: Sleeping (page-status residue from the web capture; not part of the program)
import streamlit as st | |
import fitz # PyMuPDF | |
import docx | |
from difflib import HtmlDiff, SequenceMatcher | |
import os | |
import re | |
# Functions to extract text and metadata
def extract_text_pdf(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: Either a filesystem path, or a binary file-like object
            (anything with a ``read`` method, e.g. an uploaded file).

    Returns:
        The ``page.get_text()`` output of each page, joined in page order
        with no separator.
    """
    if hasattr(file, "read"):
        # The same upload may already have been consumed by an earlier
        # extraction, so rewind before reading when the stream allows it.
        if hasattr(file, "seek"):
            file.seek(0)
        # PyMuPDF treats the first positional argument as a file name;
        # in-memory data has to go through ``stream=`` with a filetype.
        doc = fitz.open(stream=file.read(), filetype="pdf")
    else:
        doc = fitz.open(file)

    # The context manager closes the document even if a page fails to parse.
    with doc:
        return "".join(page.get_text() for page in doc)
def extract_text_word(file):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file: Whatever ``docx.Document`` accepts for the document to load.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    document = docx.Document(file)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
def extract_metadata_pdf(file):
    """Return the metadata that PyMuPDF reports for a PDF.

    Args:
        file: Either a filesystem path, or a binary file-like object
            (anything with a ``read`` method, e.g. an uploaded file).

    Returns:
        The document's ``metadata`` attribute.
    """
    if hasattr(file, "read"):
        # This is typically called after the text extraction has already
        # read the same stream, so rewind before reading again.
        if hasattr(file, "seek"):
            file.seek(0)
        # PyMuPDF treats the first positional argument as a file name;
        # in-memory data has to go through ``stream=`` with a filetype.
        doc = fitz.open(stream=file.read(), filetype="pdf")
    else:
        doc = fitz.open(file)

    # Read the metadata while the document is open, then release it.
    with doc:
        return doc.metadata
def extract_metadata_word(file):
    """Return selected core properties of a Word document.

    Args:
        file: Whatever ``docx.Document`` accepts for the document to load.

    Returns:
        A dict with the keys ``"author"``, ``"created"`` and ``"modified"``,
        each holding the same-named attribute of the document's
        ``core_properties``.
    """
    properties = docx.Document(file).core_properties
    wanted_fields = ("author", "created", "modified")
    return {field: getattr(properties, field) for field in wanted_fields}
# Function to compare text using difflib and return highlighted HTML differences
def compare_texts(text1, text2):
    """Render a side-by-side HTML diff of two texts.

    Args:
        text1: The "from" text; split into lines before diffing.
        text2: The "to" text; split into lines before diffing.

    Returns:
        A complete HTML document produced by ``difflib.HtmlDiff.make_file``.
    """
    from_lines = text1.splitlines()
    to_lines = text2.splitlines()
    renderer = HtmlDiff()
    # Contextual mode: show only changed regions plus two surrounding lines.
    return renderer.make_file(
        fromlines=from_lines,
        tolines=to_lines,
        context=True,
        numlines=2,
    )
# Function to calculate similarity score
def calculate_similarity(text1, text2):
    """Return the ``difflib.SequenceMatcher`` ratio between two texts.

    Args:
        text1: First sequence handed to the matcher.
        text2: Second sequence handed to the matcher.

    Returns:
        A float between 0.0 and 1.0; 1.0 means the sequences are identical.
    """
    return SequenceMatcher(isjunk=None, a=text1, b=text2).ratio()
# Streamlit App Interface
st.title("Document Edit Detection POC")
st.write("Upload both the original and edited documents below:")

# File upload
original_file = st.file_uploader("Upload Original Document", type=["pdf", "docx"])
edited_file = st.file_uploader("Upload Edited Document", type=["pdf", "docx"])

# Process only once both files are uploaded
if original_file is not None and edited_file is not None:
    # Identify file types. Extensions are lower-cased so that "REPORT.PDF"
    # is handled as a PDF instead of falling through to the Word branch.
    original_ext = os.path.splitext(original_file.name)[1].lower()
    edited_ext = os.path.splitext(edited_file.name)[1].lower()

    # Check if both files are of the same type
    if original_ext != edited_ext:
        st.error("Both documents must be of the same type (PDF or DOCX).")
    else:
        # Pick the extractor pair matching the shared file type
        if original_ext == ".pdf":
            extract_text, extract_metadata = extract_text_pdf, extract_metadata_pdf
        else:
            extract_text, extract_metadata = extract_text_word, extract_metadata_word

        # Extract text and metadata
        extracted = []
        try:
            for uploaded in (original_file, edited_file):
                # Each upload is read twice (text, then metadata); rewind
                # before every read so the second pass sees the whole file.
                uploaded.seek(0)
                text = extract_text(uploaded)
                uploaded.seek(0)
                extracted.append((text, extract_metadata(uploaded)))
        except Exception as exc:
            # UI boundary: report an unreadable upload to the user and halt
            # this run of the script.
            st.error(f"Could not read the uploaded documents: {exc}")
            st.stop()
        (original_text, original_metadata), (edited_text, edited_metadata) = extracted

        # Display Metadata
        st.subheader("Metadata Comparison")
        metadata_match = original_metadata == edited_metadata
        st.write("Metadata Match:", metadata_match)
        st.write("Original Document Metadata:")
        st.write(original_metadata)
        st.write("Edited Document Metadata:")
        st.write(edited_metadata)

        # Compare text
        st.subheader("Text Comparison")
        text_diff_html = compare_texts(original_text, edited_text)
        similarity_score = calculate_similarity(original_text, edited_text)
        similarity_percent = round(similarity_score * 100, 2)
        st.write("Similarity Score:", similarity_percent, "%")
        # Compare the strings directly rather than testing a float for
        # equality with 1.0.
        text_match = original_text == edited_text
        st.write("Text Match:", text_match)

        # Display highlighted text differences
        st.write("Differences:")
        st.components.v1.html(text_diff_html, height=400, scrolling=True)

        # Report Generation
        st.subheader("Report Summary")
        st.write("Metadata Match:", metadata_match)
        st.write("Text Match:", text_match)
        st.write("Similarity Score:", similarity_percent, "%")
else:
    st.info("Please upload both the original and edited documents to proceed.")