Spaces:

RenAzum
/

documentAnalyzer

Sleeping

App Files Files Community

documentAnalyzer / main.py

RenAzum

Document Analysis

d84f60a about 2 months ago

raw

history blame

3.7 kB

	import streamlit as st
	import fitz # PyMuPDF
	import docx
	from difflib import HtmlDiff, SequenceMatcher
	import os
	import re

	# Functions to extract text and metadata
	def extract_text_pdf(file):
	    """Return the plain text of every page of a PDF, joined in page order.

	    Args:
	        file: A filesystem path, or a binary file-like object exposing
	            ``read()`` (such as the object returned by
	            ``st.file_uploader``).

	    Returns:
	        A single string holding the text of all pages.
	    """
	    if hasattr(file, "read"):
	        # PyMuPDF does not read from a file-like object handed over as the
	        # filename argument, so pass the raw bytes through ``stream``.
	        # Rewind first: a previous reader may have left the stream at EOF.
	        if hasattr(file, "seek"):
	            file.seek(0)
	        doc = fitz.open(stream=file.read(), filetype="pdf")
	    else:
	        doc = fitz.open(file)

	    # The context manager closes the document once the text is collected.
	    with doc:
	        return "".join(page.get_text() for page in doc)

	def extract_text_word(file):
	    """Return the text of a Word document, one paragraph per line.

	    Args:
	        file: Whatever ``docx.Document`` accepts as its source argument.

	    Returns:
	        The ``text`` of each entry in the document's ``paragraphs``,
	        joined with newline characters.
	    """
	    paragraphs = docx.Document(file).paragraphs
	    return "\n".join(paragraph.text for paragraph in paragraphs)

	def extract_metadata_pdf(file):
	    """Return the metadata PyMuPDF reports for a PDF.

	    Args:
	        file: A filesystem path, or a binary file-like object exposing
	            ``read()`` (such as the object returned by
	            ``st.file_uploader``).

	    Returns:
	        The value of the opened document's ``metadata`` attribute.
	    """
	    if hasattr(file, "read"):
	        # PyMuPDF does not read from a file-like object handed over as the
	        # filename argument, so pass the raw bytes through ``stream``.
	        # Rewind first: the same upload is typically read for its text
	        # before this call, which leaves the stream positioned at EOF.
	        if hasattr(file, "seek"):
	            file.seek(0)
	        doc = fitz.open(stream=file.read(), filetype="pdf")
	    else:
	        doc = fitz.open(file)

	    # Read the metadata before the context manager closes the document.
	    with doc:
	        return doc.metadata

	def extract_metadata_word(file):
	    """Return selected core properties of a Word document.

	    Args:
	        file: Whatever ``docx.Document`` accepts as its source argument.

	    Returns:
	        A dict with the keys ``"author"``, ``"created"`` and
	        ``"modified"``, each holding the attribute of the same name read
	        from the document's ``core_properties``.
	    """
	    props = docx.Document(file).core_properties
	    wanted_fields = ("author", "created", "modified")
	    return {field: getattr(props, field) for field in wanted_fields}

	# Function to compare text using difflib and return highlighted HTML differences
	def compare_texts(text1, text2):
	    """Build a side-by-side HTML diff of two texts.

	    Both texts are split into lines and passed to
	    ``difflib.HtmlDiff.make_file`` in contextual mode with two lines of
	    context around each change.

	    Args:
	        text1: The original text.
	        text2: The edited text.

	    Returns:
	        A complete HTML page, as a string, with the differences highlighted.
	    """
	    original_lines = text1.splitlines()
	    edited_lines = text2.splitlines()
	    return HtmlDiff().make_file(
	        original_lines,
	        edited_lines,
	        context=True,
	        numlines=2,
	    )

	# Function to calculate similarity score
	def calculate_similarity(text1, text2):
	    """Return a character-level similarity ratio between two texts.

	    Args:
	        text1: The original text.
	        text2: The edited text.

	    Returns:
	        A float in ``[0.0, 1.0]``; ``1.0`` means the texts are identical.
	    """
	    # Identical inputs need no matching work at all.
	    if text1 == text2:
	        return 1.0

	    # Disable difflib's "autojunk" heuristic. For sequences of 200+ items it
	    # stops using frequently occurring characters as match anchors, which
	    # for ordinary document text can collapse the score of near-identical
	    # inputs (down to 0.0 in the worst case). The exact comparison is
	    # slower on very large texts but yields a trustworthy ratio.
	    matcher = SequenceMatcher(None, text1, text2, autojunk=False)
	    return matcher.ratio()

	# Streamlit App Interface
	st.title("Document Edit Detection POC")

	st.write("Upload both the original and edited documents below:")

	# File upload
	original_file = st.file_uploader("Upload Original Document", type=["pdf", "docx"])
	edited_file = st.file_uploader("Upload Edited Document", type=["pdf", "docx"])

	# Process if both files are uploaded
	if original_file and edited_file:
	    # Identify file types. Lower-case the extensions so that "A.PDF" and
	    # "b.pdf" count as the same type and an upper-case ".PDF" is still
	    # routed to the PDF branch below instead of the Word branch.
	    original_ext = os.path.splitext(original_file.name)[1].lower()
	    edited_ext = os.path.splitext(edited_file.name)[1].lower()

	    # Check if both files are of the same type
	    if original_ext != edited_ext:
	        st.error("Both documents must be of the same type (PDF or DOCX).")
	    else:
	        # Extract text and metadata with the extractor pair for this type
	        if original_ext == ".pdf":
	            original_text = extract_text_pdf(original_file)
	            edited_text = extract_text_pdf(edited_file)
	            original_metadata = extract_metadata_pdf(original_file)
	            edited_metadata = extract_metadata_pdf(edited_file)
	        else:
	            original_text = extract_text_word(original_file)
	            edited_text = extract_text_word(edited_file)
	            original_metadata = extract_metadata_word(original_file)
	            edited_metadata = extract_metadata_word(edited_file)

	        # Display Metadata
	        st.subheader("Metadata Comparison")
	        metadata_match = original_metadata == edited_metadata
	        st.write("Metadata Match:", metadata_match)

	        st.write("Original Document Metadata:")
	        st.write(original_metadata)

	        st.write("Edited Document Metadata:")
	        st.write(edited_metadata)

	        # Compare text
	        st.subheader("Text Comparison")
	        text_diff_html = compare_texts(original_text, edited_text)
	        similarity_score = calculate_similarity(original_text, edited_text)

	        st.write("Similarity Score:", round(similarity_score * 100, 2), "%")
	        # Only an exact ratio of 1.0 counts as a match.
	        text_match = similarity_score == 1.0
	        st.write("Text Match:", text_match)

	        # Display highlighted text differences
	        st.write("Differences:")
	        st.components.v1.html(text_diff_html, height=400, scrolling=True)

	        # Report Generation: repeats the headline results in one place
	        st.subheader("Report Summary")
	        st.write("Metadata Match:", metadata_match)
	        st.write("Text Match:", text_match)
	        st.write("Similarity Score:", round(similarity_score * 100, 2), "%")
	else:
	    st.info("Please upload both the original and edited documents to proceed.")