RenAzum committed on
Commit
d84f60a
1 Parent(s): 90d777d

Document Analysis

Browse files
Files changed (2) hide show
  1. main.py +107 -0
  2. requirements.txt +45 -0
main.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import fitz # PyMuPDF
3
+ import docx
4
+ from difflib import HtmlDiff, SequenceMatcher
5
+ import os
6
+ import re
7
+
8
+ # Functions to extract text and metadata
9
def extract_text_pdf(file):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        file: A filesystem path (``str`` / ``os.PathLike``), raw PDF bytes,
            or a binary file-like object such as the object returned by
            ``st.file_uploader``.

    Returns:
        The text of all pages joined in page order.
    """
    if isinstance(file, (str, os.PathLike)):
        doc = fitz.open(file)
    else:
        # PyMuPDF's positional argument is a file path; in-memory data has
        # to be handed over through ``stream`` together with a file type.
        if isinstance(file, (bytes, bytearray)):
            data = bytes(file)
        elif hasattr(file, "getvalue"):
            # getvalue() ignores the current read position, so the same
            # upload can be parsed again later (e.g. for metadata).
            data = file.getvalue()
        else:
            data = file.read()
        doc = fitz.open(stream=data, filetype="pdf")

    # Close the document deterministically instead of leaking the handle.
    with doc:
        return "".join(page.get_text() for page in doc)
15
+
16
def extract_text_word(file):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file: Anything ``docx.Document`` accepts (a path or a binary
            file-like object).

    Returns:
        The ``text`` of each entry in ``Document.paragraphs``, joined with
        newline characters.
    """
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
20
+
21
def extract_metadata_pdf(file):
    """Return the metadata dictionary of a PDF document.

    Args:
        file: A filesystem path (``str`` / ``os.PathLike``), raw PDF bytes,
            or a binary file-like object such as the object returned by
            ``st.file_uploader``.

    Returns:
        The value of the PyMuPDF document's ``metadata`` attribute.
    """
    if isinstance(file, (str, os.PathLike)):
        doc = fitz.open(file)
    else:
        # PyMuPDF's positional argument is a file path; in-memory data has
        # to be handed over through ``stream`` together with a file type.
        if isinstance(file, (bytes, bytearray)):
            data = bytes(file)
        elif hasattr(file, "getvalue"):
            # getvalue() ignores the current read position, so this works
            # even if the same upload was already read for its text.
            data = file.getvalue()
        else:
            data = file.read()
        doc = fitz.open(stream=data, filetype="pdf")

    # Read the metadata before the context manager closes the document.
    with doc:
        return doc.metadata
25
+
26
def extract_metadata_word(file):
    """Return selected core properties of a Word document.

    Args:
        file: Anything ``docx.Document`` accepts (a path or a binary
            file-like object).

    Returns:
        A dict with the keys ``"author"``, ``"created"`` and ``"modified"``
        taken from the document's ``core_properties``.
    """
    props = docx.Document(file).core_properties
    return {
        "author": props.author,
        "created": props.created,
        "modified": props.modified,
    }
35
+
36
+ # Function to compare text using difflib and return highlighted HTML differences
37
def compare_texts(text1, text2):
    """Build a complete HTML page showing a side-by-side diff of two texts.

    Both inputs are split into lines and passed to
    ``difflib.HtmlDiff.make_file`` in context mode with two lines of
    context around each change.

    Args:
        text1: The original text.
        text2: The edited text.

    Returns:
        The HTML document as a string.
    """
    original_lines = text1.splitlines()
    edited_lines = text2.splitlines()
    return HtmlDiff().make_file(
        original_lines,
        edited_lines,
        context=True,
        numlines=2,
    )
40
+
41
+ # Function to calculate similarity score
42
def calculate_similarity(text1, text2):
    """Return a character-level similarity ratio between two texts.

    Args:
        text1: The original text.
        text2: The edited text.

    Returns:
        A float in ``[0.0, 1.0]``; ``1.0`` means the texts are identical.
    """
    # autojunk must be off: with the default heuristic, any character that
    # makes up more than ~1% of a 200+ character text is dropped as a match
    # anchor, so long, nearly identical documents can score close to 0.
    # NOTE: this trades speed for accuracy on very large inputs.
    matcher = SequenceMatcher(None, text1, text2, autojunk=False)
    return matcher.ratio()
45
+
46
+ # Streamlit App Interface
47
st.title("Document Edit Detection POC")

st.write("Upload both the original and edited documents below:")

# File upload
original_file = st.file_uploader("Upload Original Document", type=["pdf", "docx"])
edited_file = st.file_uploader("Upload Edited Document", type=["pdf", "docx"])

# Process if both files are uploaded
if original_file and edited_file:
    # Identify file types. Lower-case the extension so a file named
    # "REPORT.PDF" is not routed to the Word branch below, and so that
    # "a.pdf" vs "b.PDF" is not reported as a type mismatch.
    original_ext = os.path.splitext(original_file.name)[1].lower()
    edited_ext = os.path.splitext(edited_file.name)[1].lower()

    # Check if both files are of the same type
    if original_ext != edited_ext:
        st.error("Both documents must be of the same type (PDF or DOCX).")
    else:
        # Extract text and metadata
        if original_ext == ".pdf":
            original_text = extract_text_pdf(original_file)
            edited_text = extract_text_pdf(edited_file)
            original_metadata = extract_metadata_pdf(original_file)
            edited_metadata = extract_metadata_pdf(edited_file)
        else:
            original_text = extract_text_word(original_file)
            edited_text = extract_text_word(edited_file)
            original_metadata = extract_metadata_word(original_file)
            edited_metadata = extract_metadata_word(edited_file)

        # Display Metadata
        st.subheader("Metadata Comparison")
        metadata_match = original_metadata == edited_metadata
        st.write("Metadata Match:", metadata_match)

        st.write("Original Document Metadata:")
        st.write(original_metadata)

        st.write("Edited Document Metadata:")
        st.write(edited_metadata)

        # Compare text
        st.subheader("Text Comparison")
        text_diff_html = compare_texts(original_text, edited_text)
        similarity_score = calculate_similarity(original_text, edited_text)

        st.write("Similarity Score:", round(similarity_score * 100, 2), "%")
        # The texts count as matching only when the ratio is exactly 1.0.
        text_match = similarity_score == 1.0
        st.write("Text Match:", text_match)

        # Display highlighted text differences
        st.write("Differences:")
        st.components.v1.html(text_diff_html, height=400, scrolling=True)

        # Report Generation: repeats the headline results in one place.
        st.subheader("Report Summary")
        st.write("Metadata Match:", metadata_match)
        st.write("Text Match:", text_match)
        st.write("Similarity Score:", round(similarity_score * 100, 2), "%")
else:
    st.info("Please upload both the original and edited documents to proceed.")
requirements.txt ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.4.1
2
+ attrs==24.2.0
3
+ blinker==1.9.0
4
+ cachetools==5.5.0
5
+ certifi==2024.8.30
6
+ charset-normalizer==3.4.0
7
+ click==8.1.7
8
+ colorama==0.4.6
9
+ gitdb==4.0.11
10
+ GitPython==3.1.43
11
+ idna==3.10
12
+ Jinja2==3.1.4
13
+ jsonschema==4.23.0
14
+ jsonschema-specifications==2024.10.1
15
+ lxml==5.3.0
16
+ markdown-it-py==3.0.0
17
+ MarkupSafe==3.0.2
18
+ mdurl==0.1.2
19
+ narwhals==1.13.4
20
+ numpy==2.1.3
21
+ packaging==24.2
22
+ pandas==2.2.3
23
+ pillow==11.0.0
24
+ protobuf==5.28.3
25
+ pyarrow==18.0.0
26
+ pydeck==0.9.1
27
+ Pygments==2.18.0
28
+ PyMuPDF==1.24.13
29
+ python-dateutil==2.9.0.post0
30
+ python-docx==1.1.2
31
+ pytz==2024.2
32
+ referencing==0.35.1
33
+ requests==2.32.3
34
+ rich==13.9.4
35
+ rpds-py==0.21.0
36
+ six==1.16.0
37
+ smmap==5.0.1
38
+ streamlit==1.40.1
39
+ tenacity==9.0.0
40
+ toml==0.10.2
41
+ tornado==6.4.1
42
+ typing_extensions==4.12.2
43
+ tzdata==2024.2
44
+ urllib3==2.2.3
45
+ watchdog==6.0.0