|
import os |
|
import docx |
|
from sentence_transformers import SentenceTransformer, util |
|
import gradio as gr |
|
import re |
|
from typing import List, Tuple, Dict |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
from collections import defaultdict |
|
import base64 |
|
from io import BytesIO |
|
|
|
|
|
# Flag read by extract_text_from_pdf to decide whether PDF extraction can be attempted.
pymupdf_available = False

try:
    # First choice: the package under its `pymupdf` module name.
    import pymupdf
    pymupdf_available = True
    print("PyMuPDF imported successfully")
except ImportError:
    try:
        # Second choice: the same functionality under the `fitz` module name.
        import fitz
        pymupdf_available = True
        print("fitz imported successfully")
    except ImportError:
        # Neither name could be imported; the flag stays False.
        print("PyMuPDF/fitz is not available. PDF extraction will not work.")


# Sentence-embedding model, created once at import time and used by
# calculate_cosine_similarity to encode sentences.
model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text on success. On failure no exception is raised;
        instead a message string is returned:
        "PDF processing not available. Please install PyMuPDF." when no PDF
        backend was imported, or "Error extracting PDF: <reason>" when opening
        or reading the file fails.
    """
    if not pymupdf_available:
        return "PDF processing not available. Please install PyMuPDF."

    try:
        # Use whichever backend the module-level import managed to load:
        # `pymupdf` if that name exists globally, otherwise `fitz`.
        backend = globals().get('pymupdf')
        if backend is None:
            import fitz as backend

        doc = backend.open(pdf_path)
        try:
            # join() avoids quadratic string concatenation on long documents.
            return "".join(page.get_text() for page in doc)
        finally:
            # Always release the file handle, even if a page fails to parse.
            doc.close()
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
        return f"Error extracting PDF: {str(e)}"
|
|
|
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        docx_path: Path of the DOCX file to read.

    Returns:
        The paragraph texts joined with newlines, or the string
        "Error extracting DOCX: <reason>" if anything goes wrong (the
        failure is also printed).
    """
    try:
        paragraphs = docx.Document(docx_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as exc:
        reason = str(exc)
        print(f"Error extracting text from DOCX: {reason}")
        return f"Error extracting DOCX: {reason}"
|
|
|
# Whitespace that follows '.', '?' or '!' and is not preceded by an
# abbreviation-like pattern (e.g. "e.g." or "Dr.").
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')


def preprocess_text(text: str) -> List[str]:
    """Break *text* into trimmed sentences, discarding very short fragments.

    Args:
        text: Raw document text; may be empty or None.

    Returns:
        The sentences in their original order, each stripped of surrounding
        whitespace. Fragments of 10 characters or fewer are dropped. An empty
        list is returned for empty or whitespace-only input.
    """
    if not text or not text.strip():
        return []

    trimmed = (fragment.strip() for fragment in _SENTENCE_BOUNDARY.split(text))
    return [fragment for fragment in trimmed if len(fragment) > 10]
|
|
|
def calculate_cosine_similarity(
    doc1: str,
    doc2: str,
    min_score: float = 0.3,
) -> Tuple[float, List[Tuple[str, str, float]]]:
    """Score how similar two documents are and list their similar sentences.

    Both documents are split with preprocess_text, the sentences are encoded
    with the module-level ``model``, and a cosine-similarity matrix
    (rows: sentences of ``doc1``, columns: sentences of ``doc2``) is computed.

    Args:
        doc1: Text of the first document.
        doc2: Text of the second document.
        min_score: Pairs are reported only when their similarity is strictly
            greater than this value. Defaults to 0.3, the cut-off the rest of
            the module describes as "not similar".

    Returns:
        A tuple ``(overall_similarity, all_pairs)``.
        ``overall_similarity`` is the average of two means: the mean best
        match of every ``doc1`` sentence and the mean best match of every
        ``doc2`` sentence. ``all_pairs`` is a list of
        ``(sentence1, sentence2, score)`` sorted by descending score; scores
        are NumPy scalars taken from the matrix. ``(0.0, [])`` is returned
        when either document yields no sentences.
    """
    sentences1 = preprocess_text(doc1)
    sentences2 = preprocess_text(doc2)

    if not sentences1 or not sentences2:
        return 0.0, []

    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)

    similarity_matrix = util.pytorch_cos_sim(embeddings1, embeddings2).cpu().numpy()

    # np.nonzero yields indices in row-major order, i.e. the same order a
    # nested "for i / for j" scan would visit them, so ties keep their
    # original relative order after the stable sort below.
    rows, cols = np.nonzero(similarity_matrix > min_score)
    all_pairs = [
        (sentences1[i], sentences2[j], similarity_matrix[i, j])
        for i, j in zip(rows, cols)
    ]
    all_pairs.sort(key=lambda pair: pair[2], reverse=True)

    # Symmetric document score: best match per sentence, averaged in both
    # directions so neither document dominates.
    best_for_doc1 = np.max(similarity_matrix, axis=1)
    best_for_doc2 = np.max(similarity_matrix, axis=0)
    overall_similarity = (np.mean(best_for_doc1) + np.mean(best_for_doc2)) / 2.0

    return overall_similarity, all_pairs
|
|
|
def create_similarity_barchart(all_pairs):
    """Render the similarity-score distribution as a PNG bar chart.

    Args:
        all_pairs: Iterable of ``(sentence1, sentence2, score)`` tuples.

    Returns:
        A ``data:image/png;base64,...`` URI containing the chart, or ``None``
        when ``all_pairs`` is empty. Scores below 0.3 are not counted.
    """
    if not all_pairs:
        return None

    bins = [0.3, 0.5, 0.7, 0.8, 0.9, 1.0]
    bin_labels = [
        'Slightly Related\n(30-49%)',
        'Somewhat Related\n(50-69%)',
        'Good Similarity\n(70-79%)',
        'Strong Similarity\n(80-89%)',
        'Very Strong Similarity\n(90-100%)'
    ]
    colors = ['#cccccc', '#aaddff', '#ffcc66', '#ffaa44', '#ff6666']

    # Floating-point rounding can push the cosine similarity of identical
    # sentences a hair above 1.0. np.histogram ignores values outside the
    # outermost bin edges, so cap the scores at the top edge to keep those
    # pairs in the "Very Strong" bucket instead of silently dropping them.
    scores = np.minimum(np.asarray([pair[2] for pair in all_pairs]), bins[-1])
    counts, _ = np.histogram(scores, bins=bins)

    fig, ax = plt.subplots(figsize=(14, 8))
    try:
        bars = ax.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.7)

        # Print the count above every non-empty bar.
        for count, bar in zip(counts, bars):
            if count > 0:
                ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1,
                        str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')

        ax.set_xlabel('Similarity Level', fontsize=14, fontweight='bold')
        ax.set_ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold')
        ax.set_title('Complete Similarity Distribution Analysis', fontsize=16, fontweight='bold', pad=20)
        ax.set_xticks(range(len(bin_labels)))
        ax.set_xticklabels(bin_labels, fontsize=11)

        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        ax.grid(axis='y', alpha=0.3)

        explanation_text = (
            "This chart shows the complete range of similarity between all sentence pairs in your documents.\n"
            "Pairs with less than 30% similarity are not shown as they are considered not similar."
        )
        fig.text(0.5, 0.01, explanation_text, ha="center", fontsize=11, style='italic',
                 bbox={"facecolor": "#f0f0f0", "alpha": 0.7, "pad": 5})

        buf = BytesIO()
        fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
    finally:
        # Close this specific figure even if rendering fails, so figures do
        # not accumulate across requests.
        plt.close(fig)

    return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
|
|
|
def create_similarity_summary(overall_similarity, all_pairs):
    """Build the Markdown summary shown next to the detailed results.

    Args:
        overall_similarity: Overall document score, formatted as a percentage.
        all_pairs: List of ``(sentence1, sentence2, score)`` tuples.

    Returns:
        A Markdown string with the overall score, a count of pairs per
        similarity band and, when any pair scores 0.7 or more, a count of
        those pairs per keyword category. When ``all_pairs`` is empty a
        "no significant similarities" line is emitted instead.
    """
    lines = [
        "## π Complete Similarity Analysis\n\n",
        f"**Overall Similarity Score:** <span style='color: #4CAF50; font-size: 20px;'>{overall_similarity:.2%}</span>\n\n",
    ]

    if not all_pairs:
        lines.append("No significant similarities found above the 30% threshold.\n")
        return "".join(lines)

    scores = [pair[2] for pair in all_pairs]

    def count_between(low, high):
        # Number of scores in the half-open band [low, high).
        return sum(1 for value in scores if low <= value < high)

    very_strong = sum(1 for value in scores if value >= 0.9)
    not_similar = sum(1 for value in scores if value < 0.3)

    lines.append("**Similarity Breakdown:**\n")
    lines.append(f"- π΄ Very Strong Similarity (90-100%): {very_strong} pairs\n")
    lines.append(f"- π‘ Strong Similarity (80-89%): {count_between(0.8, 0.9)} pairs\n")
    lines.append(f"- π Good Similarity (70-79%): {count_between(0.7, 0.8)} pairs\n")
    lines.append(f"- π΅ Somewhat Related (50-69%): {count_between(0.5, 0.7)} pairs\n")
    lines.append(f"- βͺ Slightly Related (30-49%): {count_between(0.3, 0.5)} pairs\n")
    lines.append(f"- β Not Similar (0-29%): {not_similar} pairs (not shown)\n\n")

    strong_matches = [pair for pair in all_pairs if pair[2] >= 0.7]
    if strong_matches:
        topics = {
            'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
            'Education': ['education', 'learn', 'course', 'degree', 'academic'],
            'Experience': ['experience', 'work', 'job', 'intern', 'position'],
            'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
            'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
        }

        tally = dict.fromkeys(topics, 0)
        tally['Other'] = 0

        for left, right, _ in strong_matches:
            # A pair is filed under the first topic whose keyword occurs
            # (as a substring) in either sentence; otherwise under 'Other'.
            label = next(
                (
                    topic
                    for topic, words in topics.items()
                    if any(word in left.lower() for word in words)
                    or any(word in right.lower() for word in words)
                ),
                'Other',
            )
            tally[label] += 1

        lines.append("**Highly Similar Content by Category:**\n")
        lines.extend(
            f"- {topic}: {hits} pairs\n" for topic, hits in tally.items() if hits > 0
        )

    return "".join(lines)
|
|
|
def group_similar_concepts(all_pairs):
    """Bucket sentence pairs by the first keyword category they mention.

    Args:
        all_pairs: Iterable of ``(sentence1, sentence2, score)`` tuples.

    Returns:
        A ``defaultdict(list)`` mapping a category name to the pairs filed
        under it. A pair belongs to the first category (in declaration order)
        with a keyword that occurs as a substring of either sentence,
        case-insensitively; pairs matching nothing go to ``'Other'``.
    """
    topics = {
        'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
        'Education': ['education', 'learn', 'course', 'degree', 'academic'],
        'Experience': ['experience', 'work', 'job', 'intern', 'position'],
        'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
        'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
    }

    def mentions(sentence, words):
        # True when any keyword appears anywhere in the lower-cased sentence.
        return any(word in sentence.lower() for word in words)

    grouped = defaultdict(list)
    for left, right, score in all_pairs:
        label = next(
            (
                topic
                for topic, words in topics.items()
                if mentions(left, words) or mentions(right, words)
            ),
            'Other',
        )
        grouped[label].append((left, right, score))

    return grouped
|
|
|
def get_similarity_color(score):
    """Map a similarity score to the hex colour of its similarity band.

    Args:
        score: Similarity value, compared against the band floors
            0.9, 0.8, 0.7 and 0.5.

    Returns:
        The colour of the highest band whose floor ``score`` reaches, or
        ``"#cccccc"`` when it is below every floor.
    """
    bands = (
        (0.9, "#ff6666"),
        (0.8, "#ffaa44"),
        (0.7, "#ffcc66"),
        (0.5, "#aaddff"),
    )
    for floor, colour in bands:
        if score >= floor:
            return colour
    return "#cccccc"
|
|
|
def _document_kind(path):
    """Return '.pdf' or '.docx' for a supported path (case-insensitive), else None."""
    lowered = str(path).lower()
    for suffix in ('.pdf', '.docx'):
        if lowered.endswith(suffix):
            return suffix
    return None


def _is_extraction_failure(text):
    """Tell whether *text* is one of the failure messages the extractors return."""
    return text.startswith((
        "Error extracting PDF: ",
        "Error extracting DOCX: ",
        "PDF processing not available.",
    ))


def similarity(file1, file2):
    """Compare two uploaded documents and build the result views.

    Args:
        file1: First uploaded file: an object exposing the file path as
            ``.name`` (a plain path string is accepted as well), or ``None``.
        file2: Second uploaded file, same conventions.

    Returns:
        A tuple ``(output_html, summary_text, barchart_image)``. On any
        problem (missing upload, unsupported extension, extraction failure)
        the first element is a plain message and the other two are ``None``.
    """
    # Local import keeps this block self-contained.
    from html import escape

    if file1 is None or file2 is None:
        return "Please upload both documents.", None, None

    paths = [getattr(uploaded, "name", uploaded) for uploaded in (file1, file2)]
    kinds = [_document_kind(path) for path in paths]

    # Reject unsupported formats before doing any extraction work.
    for number, kind in enumerate(kinds, start=1):
        if kind is None:
            return f"Unsupported file format for Document {number}. Please upload PDF or DOCX.", None, None

    try:
        text1, text2 = [
            extract_text_from_pdf(path) if kind == '.pdf' else extract_text_from_docx(path)
            for path, kind in zip(paths, kinds)
        ]
    except Exception as e:
        return f"Error processing files: {str(e)}", None, None

    # The extractors signal failure by returning a message instead of raising.
    # Match those messages by prefix: a substring test for "Error" would
    # reject any genuine document that merely contains that word, and would
    # miss the "PDF processing not available" message entirely.
    failures = [
        f"Document {number}: {text}"
        for number, text in enumerate((text1, text2), start=1)
        if text and _is_extraction_failure(text)
    ]
    if failures:
        return " ".join(failures), None, None
    if not text1 or not text2:
        return "Error extracting text from one or both documents.", None, None

    overall_similarity, all_pairs = calculate_cosine_similarity(text1, text2)

    high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7]
    concept_groups = group_similar_concepts(high_similarity_pairs)

    html_parts = [
        f"<h3>Overall Similarity Score: <span style='color: #4CAF50;'>{overall_similarity:.2%}</span></h3>"
    ]

    if high_similarity_pairs:
        html_parts.append(
            f"<h4>Found {len(high_similarity_pairs)} significant similar sentence pairs (70%+):</h4>"
        )

        for concept, pairs in concept_groups.items():
            if not pairs:
                continue
            html_parts.append(f"<h5>π {concept}:</h5>")
            for sent1, sent2, score in pairs:
                color = get_similarity_color(score)
                # Sentences come from user-supplied documents, so escape them
                # before placing them inside markup.
                html_parts.append(f"""
                <div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
                    <p><b>π Document 1:</b> {escape(sent1)}</p>
                    <p><b>π Document 2:</b> {escape(sent2)}</p>
                    <p><b>Similarity:</b> <span style='color: {color}; font-weight: bold;'>{score:.2%}</span></p>
                </div>
                """)
    else:
        html_parts.append("<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>")
        html_parts.append("<p>β οΈ No significant similarities found above the 70% threshold.</p>")
        html_parts.append("</div>")

    output_html = "".join(html_parts)

    barchart_image = create_similarity_barchart(all_pairs)
    summary_text = create_similarity_summary(overall_similarity, all_pairs)

    return output_html, summary_text, barchart_image
|
|
|
|
|
# Gradio UI definition. Components are rendered in the order they are created
# inside the layout context managers below.
with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown("""
    # π DocuTwin Finder
    Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
    """)

    with gr.Row():
        # Left column: the two uploads and the trigger button.
        with gr.Column(scale=1):
            gr.Markdown("### Upload Documents")
            file1 = gr.File(label="Document 1", file_types=[".pdf", ".docx"])
            file2 = gr.File(label="Document 2", file_types=[".pdf", ".docx"])
            submit_btn = gr.Button("Compare Documents", variant="primary")

        # Right column: Markdown summary followed by the detailed HTML report.
        with gr.Column(scale=2):
            gr.Markdown("### Analysis Results")
            summary_output = gr.Markdown()
            output_html = gr.HTML(label="Highly Similar Content (70%+)")

    # NOTE(review): the original nesting of this legend and chart placeholder
    # could not be recovered from the source; they are assumed to sit at the
    # Blocks level, below the row. Confirm against the intended layout.
    gr.Markdown("""
    ### π Complete Similarity Distribution

    **Color Guide:**
    - π΄ Very Strong Similarity (90-100%) - Essentially identical content
    - π‘ Strong Similarity (80-89%) - Very similar with minor differences
    - π Good Similarity (70-79%) - Related concepts with noticeable differences
    - π΅ Somewhat Related (50-69%) - Shared concepts but different focus
    - βͺ Slightly Related (30-49%) - Barely related topics
    - β Not Similar (0-29%) - Completely different content (not shown)
    """)
    barchart_display = gr.HTML()

    def process_files(file1, file2):
        """Run similarity() and wrap the chart data URI in an <img> tag.

        The parameters shadow the upload components of the same name; inside
        this function they are the values passed in by the click handler.
        Returns the three values bound to output_html, summary_output and
        barchart_display.
        """
        result_html, summary_text, barchart_img = similarity(file1, file2)

        # Placeholder shown when similarity() returned no chart.
        barchart_html = "<p>No similarity data available for visualization</p>"
        if barchart_img:
            barchart_html = f'<img src="{barchart_img}" alt="Complete Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'

        return result_html, summary_text, barchart_html

    # Wire the button: two uploads in, three output components updated.
    submit_btn.click(
        fn=process_files,
        inputs=[file1, file2],
        outputs=[output_html, summary_output, barchart_display]
    )
|
|
|
|
|
# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    # Binds to all network interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|