import os
import docx
from sentence_transformers import SentenceTransformer, util
import gradio as gr
import re
from typing import List, Tuple, Dict
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
import base64
from io import BytesIO

# Try to import PyMuPDF; newer releases expose it as 'pymupdf', older ones as
# 'fitz'. Either way, bind it to the single name 'fitz' used by extract_text_from_pdf.
pymupdf_available = False
try:
    import pymupdf as fitz
    pymupdf_available = True
    print("PyMuPDF imported successfully")
except ImportError:
    try:
        import fitz
        pymupdf_available = True
        print("fitz imported successfully")
    except ImportError:
        print("PyMuPDF/fitz is not available. PDF extraction will not work.")

# Initialize the SentenceTransformer model; all-MiniLM-L6-v2 maps each sentence
# to a 384-dimensional embedding suited to cosine-similarity comparison
model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_text_from_pdf(pdf_path):
    if not pymupdf_available:
        # Prefix with "Error" so the caller's error check picks this up
        return "Error extracting PDF: PyMuPDF is not installed."

    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
        return f"Error extracting PDF: {str(e)}"

def extract_text_from_docx(docx_path):
    try:
        doc = docx.Document(docx_path)
        text = "\n".join([para.text for para in doc.paragraphs])
        return text
    except Exception as e:
        print(f"Error extracting text from DOCX: {str(e)}")
        return f"Error extracting DOCX: {str(e)}"

def preprocess_text(text: str) -> List[str]:
    """Split text into sentences and clean them"""
    if not text or text.strip() == "":
        return []
    
    # Split on sentence-ending punctuation followed by whitespace; the negative
    # lookbehinds avoid splitting after abbreviations such as "U.S." or "Mr."
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
    # Keep only non-trivial sentences (drop fragments of 10 characters or fewer)
    sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
    return sentences
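
# Illustrative sketch (not called by the app): how preprocess_text behaves on a
# short passage, given the regex above. Abbreviations like "U.S." are not
# treated as sentence boundaries, and short fragments are filtered out.
#
#   preprocess_text("I work in the U.S. on NLP research. It is fun.")
#   -> ['I work in the U.S. on NLP research.']
#   ("It is fun." is exactly 10 characters, so the > 10 filter drops it)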

def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple[str, str, float]]]:
    """Calculate similarity score and return similar sentence pairs"""
    # Preprocess texts into sentences
    sentences1 = preprocess_text(doc1)
    sentences2 = preprocess_text(doc2)
    
    if not sentences1 or not sentences2:
        return 0.0, []
    
    # Get embeddings for all sentences
    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)
    
    # Calculate cosine similarities between all sentence pairs
    cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)
    similarity_matrix = cosine_similarities.cpu().numpy()
    
    # Find the most similar sentences (all pairs for comprehensive analysis)
    all_pairs = []
    
    for i in range(len(sentences1)):
        for j in range(len(sentences2)):
            similarity_score = similarity_matrix[i][j]
            if similarity_score > 0.3:  # Include even lower similarities for comprehensive analysis
                all_pairs.append((sentences1[i], sentences2[j], similarity_score))
    
    # Sort by similarity score (highest first)
    all_pairs.sort(key=lambda x: x[2], reverse=True)

    # Overall similarity: for each sentence, take its best match in the other
    # document, then average the two directions (a symmetric mean-of-max score)
    max_similarities1 = np.max(similarity_matrix, axis=1)
    max_similarities2 = np.max(similarity_matrix, axis=0)
    overall_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
    
    return overall_similarity, all_pairs
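
# Worked example (comments only, assuming a toy 2x2 similarity matrix): the
# overall score above is a symmetric "mean of row/column maxima".
#
#   similarity_matrix = [[0.9, 0.1],
#                        [0.2, 0.8]]
#   np.max(..., axis=1) -> [0.9, 0.8]   # best match for each Doc 1 sentence
#   np.max(..., axis=0) -> [0.9, 0.8]   # best match for each Doc 2 sentence
#   overall = (mean([0.9, 0.8]) + mean([0.9, 0.8])) / 2 = 0.85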

def create_similarity_barchart(all_pairs):
    """Create a bar chart showing similarity distribution across all levels"""
    if not all_pairs:
        return None
    
    plt.figure(figsize=(14, 8))
    
    # Extract similarity scores
    scores = [pair[2] for pair in all_pairs]
    
    # Create bins for all similarity levels
    bins = [0.3, 0.5, 0.7, 0.8, 0.9, 1.0]
    bin_labels = [
        'Slightly Related\n(30-49%)', 
        'Somewhat Related\n(50-69%)', 
        'Good Similarity\n(70-79%)',
        'Strong Similarity\n(80-89%)', 
        'Very Strong Similarity\n(90-100%)'
    ]
    
    # Count pairs in each bin
    counts, _ = np.histogram(scores, bins=bins)
    
    # Create bar chart with colors for all levels
    colors = ['#cccccc', '#aaddff', '#ffcc66', '#ffaa44', '#ff6666']
    bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.7)
    
    # Add value labels on bars
    for count, bar in zip(counts, bars):
        if count > 0:
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                    str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')
    
    plt.xlabel('Similarity Level', fontsize=14, fontweight='bold')
    plt.ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold')
    plt.title('Complete Similarity Distribution Analysis', fontsize=16, fontweight='bold', pad=20)
    plt.xticks(range(len(bin_labels)), bin_labels, fontsize=11)
    
    # Remove top and right spines
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    
    # Add grid for better readability
    plt.grid(axis='y', alpha=0.3)
    
    # Add explanation
    explanation_text = (
        "This chart shows the complete range of similarity between all sentence pairs in your documents.\n"
        "Pairs with less than 30% similarity are not shown as they are considered not similar."
    )
    plt.figtext(0.5, 0.01, explanation_text, ha="center", fontsize=11, style='italic', 
                bbox={"facecolor":"#f0f0f0", "alpha":0.7, "pad":5})
    
    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
    plt.close()
    buf.seek(0)
    
    return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"

def create_similarity_summary(overall_similarity, all_pairs):
    """Create a text summary of the similarity analysis"""
    summary = f"## πŸ“Š Complete Similarity Analysis\n\n"
    summary += f"**Overall Similarity Score:** <span style='color: #4CAF50; font-size: 20px;'>{overall_similarity:.2%}</span>\n\n"
    
    if all_pairs:
        # Count pairs in each category
        very_strong = len([p for p in all_pairs if p[2] >= 0.9])
        strong = len([p for p in all_pairs if 0.8 <= p[2] < 0.9])
        good = len([p for p in all_pairs if 0.7 <= p[2] < 0.8])
        somewhat_related = len([p for p in all_pairs if 0.5 <= p[2] < 0.7])
        slightly_related = len([p for p in all_pairs if 0.3 <= p[2] < 0.5])
        
        summary += "**Similarity Breakdown:**\n"
        summary += f"- πŸ”΄ Very Strong Similarity (90-100%): {very_strong} pairs\n"
        summary += f"- 🟑 Strong Similarity (80-89%): {strong} pairs\n"
        summary += f"- 🟠 Good Similarity (70-79%): {good} pairs\n"
        summary += f"- πŸ”΅ Somewhat Related (50-69%): {somewhat_related} pairs\n"
        summary += f"- βšͺ Slightly Related (30-49%): {slightly_related} pairs\n"
        summary += f"- ❌ Not Similar (0-29%): {len([p for p in all_pairs if p[2] < 0.3])} pairs (not shown)\n\n"
        
        # Most common concepts among the higher-similarity pairs; reuse the
        # shared keyword grouping below instead of duplicating the concept table
        high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7]
        if high_similarity_pairs:
            summary += "**Highly Similar Content by Category:**\n"
            for concept, pairs in group_similar_concepts(high_similarity_pairs).items():
                summary += f"- {concept}: {len(pairs)} pairs\n"
    else:
        summary += "No significant similarities found above the 30% threshold.\n"
    
    return summary

def group_similar_concepts(all_pairs):
    """Group similar sentences by concept using keyword extraction"""
    concept_groups = defaultdict(list)
    
    concepts = {
        'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
        'Education': ['education', 'learn', 'course', 'degree', 'academic'],
        'Experience': ['experience', 'work', 'job', 'intern', 'position'],
        'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
        'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
    }
    
    for sent1, sent2, score in all_pairs:
        matched_concept = 'Other'
        for concept, keywords in concepts.items():
            if any(keyword in sent1.lower() for keyword in keywords) or \
               any(keyword in sent2.lower() for keyword in keywords):
                matched_concept = concept
                break
        concept_groups[matched_concept].append((sent1, sent2, score))
    
    return concept_groups
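
# Example grouping (illustrative): a pair mentioning "research" lands in the
# Research bucket; pairs with no keyword hit fall back to 'Other'.
#   group_similar_concepts([("My research on X.", "A study of X.", 0.82)])
#   -> defaultdict with {'Research': [("My research on X.", "A study of X.", 0.82)]}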

def get_similarity_color(score):
    """Get color based on similarity score"""
    if score >= 0.9:
        return "#ff6666"  # Red - Very Strong
    elif score >= 0.8:
        return "#ffaa44"  # Orange - Strong
    elif score >= 0.7:
        return "#ffcc66"  # Yellow - Good
    elif score >= 0.5:
        return "#aaddff"  # Blue - Somewhat Related
    else:
        return "#cccccc"  # Gray - Slightly Related

def similarity(file1, file2):
    if file1 is None or file2 is None:
        return "Please upload both documents.", None, None
    
    try:
        # Match extensions case-insensitively so ".PDF"/".DOCX" uploads work too
        if file1.name.lower().endswith('.pdf'):
            text1 = extract_text_from_pdf(file1.name)
        elif file1.name.lower().endswith('.docx'):
            text1 = extract_text_from_docx(file1.name)
        else:
            return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None, None

        if file2.name.lower().endswith('.pdf'):
            text2 = extract_text_from_pdf(file2.name)
        elif file2.name.lower().endswith('.docx'):
            text2 = extract_text_from_docx(file2.name)
        else:
            return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None, None
    except Exception as e:
        return f"Error processing files: {str(e)}", None, None
    
    # Extractor failures are reported as strings starting with "Error"; checking
    # the prefix (rather than any substring) avoids false positives on documents
    # that merely contain the word "Error"
    if not text1 or not text2 or text1.startswith("Error") or text2.startswith("Error"):
        error_msg = ""
        if text1.startswith("Error"):
            error_msg += f"Document 1: {text1} "
        if text2.startswith("Error"):
            error_msg += f"Document 2: {text2}"
        return error_msg if error_msg else "Error extracting text from one or both documents.", None, None
    
    overall_similarity, all_pairs = calculate_cosine_similarity(text1, text2)
    
    # Filter to show only higher similarity pairs in detailed view (70%+)
    high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7]
    concept_groups = group_similar_concepts(high_similarity_pairs)
    
    # Prepare detailed output
    output_html = f"<h3>Overall Similarity Score: <span style='color: #4CAF50;'>{overall_similarity:.2%}</span></h3>"
    
    if high_similarity_pairs:
        output_html += f"<h4>Found {len(high_similarity_pairs)} significant similar sentence pairs (70%+):</h4>"
        
        for concept, pairs in concept_groups.items():
            if pairs:
                output_html += f"<h5>πŸ” {concept}:</h5>"
                for i, (sent1, sent2, score) in enumerate(pairs):
                    color = get_similarity_color(score)
                    output_html += f"""
                    <div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
                        <p><b>πŸ“„ Document 1:</b> {sent1}</p>
                        <p><b>πŸ“„ Document 2:</b> {sent2}</p>
                        <p><b>Similarity:</b> <span style='color: {color}; font-weight: bold;'>{score:.2%}</span></p>
                    </div>
                    """
    else:
        output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
        output_html += "<p>⚠️ No significant similarities found above the 70% threshold.</p>"
        output_html += "</div>"
    
    # Generate bar chart showing ALL similarity levels
    barchart_image = create_similarity_barchart(all_pairs)
    summary_text = create_similarity_summary(overall_similarity, all_pairs)
    
    return output_html, summary_text, barchart_image

# Create a clean Gradio interface
with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # πŸ“„ DocuTwin Finder
    Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Upload Documents")
            file1 = gr.File(label="Document 1", file_types=[".pdf", ".docx"])
            file2 = gr.File(label="Document 2", file_types=[".pdf", ".docx"])
            submit_btn = gr.Button("Compare Documents", variant="primary")
        
        with gr.Column(scale=2):
            gr.Markdown("### Analysis Results")
            summary_output = gr.Markdown()
            output_html = gr.HTML(label="Highly Similar Content (70%+)")
    
    gr.Markdown("""
    ### πŸ“Š Complete Similarity Distribution
    
    **Color Guide:**
    - πŸ”΄ Very Strong Similarity (90-100%) - Essentially identical content
    - 🟠 Strong Similarity (80-89%) - Very similar with minor differences
    - 🟑 Good Similarity (70-79%) - Related concepts with noticeable differences
    - πŸ”΅ Somewhat Related (50-69%) - Shared concepts but different focus
    - βšͺ Slightly Related (30-49%) - Barely related topics
    - ❌ Not Similar (0-29%) - Completely different content (not shown)
    """)
    barchart_display = gr.HTML()
    
    # Define the processing function
    def process_files(file1, file2):
        result_html, summary_text, barchart_img = similarity(file1, file2)
        
        barchart_html = "<p>No similarity data available for visualization</p>"
        if barchart_img:
            barchart_html = f'<img src="{barchart_img}" alt="Complete Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
        
        return result_html, summary_text, barchart_html
    
    # Connect the button
    submit_btn.click(
        fn=process_files,
        inputs=[file1, file2],
        outputs=[output_html, summary_output, barchart_display]
    )

# Launch the application
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
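
# Headless sanity check (hypothetical file paths; similarity() only reads the
# .name attribute, so any object exposing one will do):
#
#   from types import SimpleNamespace
#   html, summary, chart = similarity(SimpleNamespace(name="doc1.pdf"),
#                                     SimpleNamespace(name="doc2.docx"))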