documentAnalyzer / main.py
RenAzum's picture
Document Analysis
d84f60a
raw
history blame
3.7 kB
import streamlit as st
import fitz # PyMuPDF
import docx
from difflib import HtmlDiff, SequenceMatcher
import os
import re
# Functions to extract text and metadata
def extract_text_pdf(file):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        file: Either a filesystem path, or a binary file-like object that
            exposes ``read()`` (for example the object returned by
            ``st.file_uploader``).

    Returns:
        The text of all pages, joined in page order.
    """
    if hasattr(file, "read"):
        # PyMuPDF interprets its first positional argument as a filename, so
        # an in-memory upload has to be handed over as raw bytes instead.
        if hasattr(file, "seek"):
            # An earlier reader may have left the position at end-of-stream.
            file.seek(0)
        doc = fitz.open(stream=file.read(), filetype="pdf")
    else:
        doc = fitz.open(file)
    try:
        # Pages must be read while the document is still open.
        return "".join(page.get_text() for page in doc)
    finally:
        doc.close()
def extract_text_word(file):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file: Whatever ``docx.Document`` accepts as its source.

    Returns:
        The ``text`` of each entry in ``paragraphs``, separated by newlines.
    """
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def extract_metadata_pdf(file):
    """Return the metadata of a PDF document.

    Args:
        file: Either a filesystem path, or a binary file-like object that
            exposes ``read()`` (for example the object returned by
            ``st.file_uploader``).

    Returns:
        The value of the opened document's ``metadata`` attribute.
    """
    if hasattr(file, "read"):
        # PyMuPDF interprets its first positional argument as a filename, so
        # an in-memory upload has to be handed over as raw bytes instead.
        if hasattr(file, "seek"):
            # The same upload may already have been read for text extraction.
            file.seek(0)
        doc = fitz.open(stream=file.read(), filetype="pdf")
    else:
        doc = fitz.open(file)
    try:
        return doc.metadata
    finally:
        doc.close()
def extract_metadata_word(file):
    """Return selected core properties of a Word document.

    Args:
        file: Whatever ``docx.Document`` accepts as its source.

    Returns:
        A dict with the keys ``author``, ``created`` and ``modified`` taken
        from the document's ``core_properties``.
    """
    props = docx.Document(file).core_properties
    wanted_fields = ("author", "created", "modified")
    return {field: getattr(props, field) for field in wanted_fields}
# Function to compare text using difflib and return highlighted HTML differences
def compare_texts(text1, text2):
    """Build an HTML page highlighting line-level differences.

    Args:
        text1: Original text.
        text2: Edited text.

    Returns:
        A complete HTML document produced by ``difflib.HtmlDiff`` in context
        mode, showing two lines of context around each change.
    """
    original_lines = text1.splitlines()
    edited_lines = text2.splitlines()
    html_differ = HtmlDiff()
    return html_differ.make_file(
        original_lines,
        edited_lines,
        context=True,
        numlines=2,
    )
# Function to calculate similarity score
def calculate_similarity(text1, text2):
    """Return a character-level similarity ratio between two texts.

    Args:
        text1: Original text.
        text2: Edited text.

    Returns:
        A float in ``[0.0, 1.0]``; ``1.0`` means the texts are identical.
    """
    if text1 == text2:
        # Identical inputs always score 1.0; skip the expensive matching.
        return 1.0
    # The default ``autojunk`` heuristic stops frequent characters from
    # anchoring matches once a sequence reaches 200 items. On document text
    # that can push the ratio towards 0 for near-identical inputs, so it is
    # disabled here. NOTE: this makes very large documents slower to compare.
    matcher = SequenceMatcher(None, text1, text2, autojunk=False)
    return matcher.ratio()
# Streamlit App Interface
st.title("Document Edit Detection POC")
st.write("Upload both the original and edited documents below:")

# File upload
original_file = st.file_uploader("Upload Original Document", type=["pdf", "docx"])
edited_file = st.file_uploader("Upload Edited Document", type=["pdf", "docx"])

# Process if both files are uploaded
if original_file and edited_file:
    # Identify file types. Extensions are lower-cased so that "REPORT.PDF"
    # and "report.pdf" are treated as the same type and routed to the PDF
    # extractors rather than falling through to the Word branch.
    original_ext = os.path.splitext(original_file.name)[1].lower()
    edited_ext = os.path.splitext(edited_file.name)[1].lower()

    # Check if both files are of the same type
    if original_ext != edited_ext:
        st.error("Both documents must be of the same type (PDF or DOCX).")
    else:
        # Extract text and metadata
        if original_ext == ".pdf":
            original_text = extract_text_pdf(original_file)
            edited_text = extract_text_pdf(edited_file)
            original_metadata = extract_metadata_pdf(original_file)
            edited_metadata = extract_metadata_pdf(edited_file)
        else:
            original_text = extract_text_word(original_file)
            edited_text = extract_text_word(edited_file)
            original_metadata = extract_metadata_word(original_file)
            edited_metadata = extract_metadata_word(edited_file)

        # Display Metadata
        st.subheader("Metadata Comparison")
        metadata_match = original_metadata == edited_metadata
        st.write("Metadata Match:", metadata_match)
        st.write("Original Document Metadata:")
        st.write(original_metadata)
        st.write("Edited Document Metadata:")
        st.write(edited_metadata)

        # Compare text
        st.subheader("Text Comparison")
        text_diff_html = compare_texts(original_text, edited_text)
        similarity_score = calculate_similarity(original_text, edited_text)
        similarity_percent = round(similarity_score * 100, 2)
        st.write("Similarity Score:", similarity_percent, "%")
        # Only an exact ratio of 1.0 counts as a textual match.
        text_match = similarity_score == 1.0
        st.write("Text Match:", text_match)

        # Display highlighted text differences
        st.write("Differences:")
        st.components.v1.html(text_diff_html, height=400, scrolling=True)

        # Report Generation
        st.subheader("Report Summary")
        st.write("Metadata Match:", metadata_match)
        st.write("Text Match:", text_match)
        st.write("Similarity Score:", similarity_percent, "%")
else:
    st.info("Please upload both the original and edited documents to proceed.")