Spaces:

NaimaAqeel
/

document-similarity-checker

Running

App Files Files Community

document-similarity-checker / app.py

NaimaAqeel

Update app.py

072f334 verified 5 days ago

raw

history blame contribute delete

15.6 kB

	import os
	import docx
	from sentence_transformers import SentenceTransformer, util
	import gradio as gr
	import re
	from typing import List, Tuple, Dict
	import matplotlib.pyplot as plt
	import numpy as np
	from collections import defaultdict
	import base64
	from io import BytesIO

	# Optional PDF backend: try the ``pymupdf`` module name first, then the
	# ``fitz`` name, and record in ``pymupdf_available`` whether either import
	# succeeded so PDF extraction can be refused cleanly when neither is present.
	pymupdf_available = False
	try:
	    import pymupdf
	except ImportError:
	    try:
	        import fitz
	    except ImportError:
	        print("PyMuPDF/fitz is not available. PDF extraction will not work.")
	    else:
	        pymupdf_available = True
	        print("fitz imported successfully")
	else:
	    pymupdf_available = True
	    print("PyMuPDF imported successfully")

	# Load the sentence-embedding model once, at import time. The similarity
	# code in this module reads this global and calls ``model.encode`` on it.
	model = SentenceTransformer('all-MiniLM-L6-v2')

	def extract_text_from_pdf(pdf_path):
	    """Return the concatenated text of every page of a PDF file.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        The extracted text on success. Failures are reported through the
	        return value rather than by raising: a fixed notice when no PDF
	        backend could be imported, or ``"Error extracting PDF: <reason>"``
	        when opening or reading the file fails.
	    """
	    if not pymupdf_available:
	        return "PDF processing not available. Please install PyMuPDF."

	    try:
	        # Use whichever module name was importable at start-up.
	        if 'pymupdf' in globals():
	            pdf_backend = pymupdf
	        else:
	            import fitz as pdf_backend

	        # The context manager closes the document (and its file handle) on
	        # every exit path, including when a page fails to yield text.
	        with pdf_backend.open(pdf_path) as doc:
	            return "".join(page.get_text() for page in doc)
	    except Exception as e:
	        # Deliberately broad: callers expect an error string, not an exception.
	        print(f"Error extracting text from PDF: {str(e)}")
	        return f"Error extracting PDF: {str(e)}"

	def extract_text_from_docx(docx_path):
	    """Return the text of a DOCX file, one paragraph per line.

	    Any failure is logged to stdout and reported through the return value
	    as ``"Error extracting DOCX: <reason>"`` instead of being raised.
	    """
	    try:
	        paragraphs = docx.Document(docx_path).paragraphs
	        return "\n".join(paragraph.text for paragraph in paragraphs)
	    except Exception as e:
	        print(f"Error extracting text from DOCX: {str(e)}")
	        return f"Error extracting DOCX: {str(e)}"

	def preprocess_text(text: str) -> List[str]:
	    """Split text into cleaned sentences.

	    A sentence boundary is whitespace that follows ``.``, ``?`` or ``!``,
	    except after dotted abbreviations such as ``e.g.`` or a capitalised
	    two-letter abbreviation such as ``Dr.``. Fragments of 10 characters or
	    fewer (after stripping) are discarded.

	    Args:
	        text: Raw document text; may be empty or ``None``.

	    Returns:
	        The list of stripped sentences, or ``[]`` for empty/blank input.
	    """
	    if not text or text.strip() == "":
	        return []

	    # The terminator lookbehind must be a character class: escaped pipes
	    # (``\.\|\?\|\!``) would demand the literal text ".|?|!" and never split.
	    sentence_boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s+'
	    fragments = (fragment.strip() for fragment in re.split(sentence_boundary, text))
	    return [fragment for fragment in fragments if len(fragment) > 10]

	def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple[str, str, float]]]:
	    """Compare two documents sentence by sentence using embedding cosine similarity.

	    Returns:
	        ``(overall, pairs)``. ``overall`` is the average of two means: each
	        sentence of ``doc1`` scored against its best match in ``doc2``, and
	        vice versa. ``pairs`` holds every ``(sentence1, sentence2, score)``
	        whose score exceeds 0.3, highest score first. When either document
	        yields no sentences the result is ``(0.0, [])``.
	    """
	    left_sentences = preprocess_text(doc1)
	    right_sentences = preprocess_text(doc2)
	    if not (left_sentences and right_sentences):
	        return 0.0, []

	    # Embed both sentence lists and build the full left-by-right score matrix.
	    left_vectors = model.encode(left_sentences, convert_to_tensor=True)
	    right_vectors = model.encode(right_sentences, convert_to_tensor=True)
	    similarity_matrix = util.pytorch_cos_sim(left_vectors, right_vectors).cpu().numpy()

	    # Keep every cross-document pair above the 0.3 floor (row-major order),
	    # then rank them; the sort is stable so ties keep that order.
	    candidate_pairs = [
	        (left, right, row[col])
	        for left, row in zip(left_sentences, similarity_matrix)
	        for col, right in enumerate(right_sentences)
	        if row[col] > 0.3
	    ]
	    ranked_pairs = sorted(candidate_pairs, key=lambda pair: pair[2], reverse=True)

	    # Overall score: symmetric average of the best-match scores.
	    best_for_left = np.max(similarity_matrix, axis=1)
	    best_for_right = np.max(similarity_matrix, axis=0)
	    overall_similarity = (np.mean(best_for_left) + np.mean(best_for_right)) / 2.0

	    return overall_similarity, ranked_pairs

	def create_similarity_barchart(all_pairs):
	    """Render a bar chart of how many sentence pairs fall in each similarity band.

	    Args:
	        all_pairs: Sequence of ``(sentence1, sentence2, score)`` tuples; only
	            the score at index 2 is read.

	    Returns:
	        A ``data:image/png;base64,...`` URI containing the chart, or ``None``
	        when ``all_pairs`` is empty or ``None``.
	    """
	    if not all_pairs:
	        return None

	    bins = [0.3, 0.5, 0.7, 0.8, 0.9, 1.0]
	    bin_labels = [
	        'Slightly Related\n(30-49%)',
	        'Somewhat Related\n(50-69%)',
	        'Good Similarity\n(70-79%)',
	        'Strong Similarity\n(80-89%)',
	        'Very Strong Similarity\n(90-100%)'
	    ]
	    colors = ['#cccccc', '#aaddff', '#ffcc66', '#ffaa44', '#ff6666']

	    # np.histogram ignores values outside [bins[0], bins[-1]]. Floating-point
	    # cosine similarity of (near-)identical sentences can land marginally
	    # above 1.0, so cap scores at the top edge to keep those pairs counted.
	    # Scores below the lowest edge are still left out, as before.
	    scores = np.minimum([pair[2] for pair in all_pairs], bins[-1])
	    counts, _ = np.histogram(scores, bins=bins)

	    # Use an explicit figure/axes pair instead of pyplot's implicit "current
	    # figure", and always close the figure so a failed render cannot leak it.
	    fig, ax = plt.subplots(figsize=(14, 8))
	    try:
	        bars = ax.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.7)

	        # Print the count above every non-empty bar.
	        for count, bar in zip(counts, bars):
	            if count > 0:
	                ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1,
	                        str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')

	        ax.set_xlabel('Similarity Level', fontsize=14, fontweight='bold')
	        ax.set_ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold')
	        ax.set_title('Complete Similarity Distribution Analysis', fontsize=16, fontweight='bold', pad=20)
	        ax.set_xticks(range(len(bin_labels)))
	        ax.set_xticklabels(bin_labels, fontsize=11)

	        # Remove top and right spines, add a light horizontal grid.
	        ax.spines['top'].set_visible(False)
	        ax.spines['right'].set_visible(False)
	        ax.grid(axis='y', alpha=0.3)

	        explanation_text = (
	            "This chart shows the complete range of similarity between all sentence pairs in your documents.\n"
	            "Pairs with less than 30% similarity are not shown as they are considered not similar."
	        )
	        fig.text(0.5, 0.01, explanation_text, ha="center", fontsize=11, style='italic',
	                 bbox={"facecolor": "#f0f0f0", "alpha": 0.7, "pad": 5})

	        buf = BytesIO()
	        fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
	    finally:
	        plt.close(fig)

	    encoded_png = base64.b64encode(buf.getvalue()).decode('utf-8')
	    return f"data:image/png;base64,{encoded_png}"

	def create_similarity_summary(overall_similarity, all_pairs):
	    """Build the Markdown summary: overall score, per-band pair counts, and a
	    keyword-based category breakdown of the pairs scoring 70% or more.

	    ``all_pairs`` is a sequence of ``(sentence1, sentence2, score)`` tuples.
	    When it is empty only the header and a "nothing found" line are returned.
	    """
	    sections = [
	        "## 📊 Complete Similarity Analysis\n\n",
	        f"Overall Similarity Score: <span style='color: #4CAF50; font-size: 20px;'>{overall_similarity:.2%}</span>\n\n",
	    ]

	    if not all_pairs:
	        sections.append("No significant similarities found above the 30% threshold.\n")
	        return "".join(sections)

	    scores = [pair[2] for pair in all_pairs]

	    def count_between(low, high):
	        # Number of scores in the half-open band [low, high).
	        return sum(1 for value in scores if low <= value < high)

	    very_strong = sum(1 for value in scores if value >= 0.9)
	    strong = count_between(0.8, 0.9)
	    good = count_between(0.7, 0.8)
	    somewhat_related = count_between(0.5, 0.7)
	    slightly_related = count_between(0.3, 0.5)
	    not_similar = sum(1 for value in scores if value < 0.3)

	    sections.extend([
	        "Similarity Breakdown:\n",
	        f"- 🔴 Very Strong Similarity (90-100%): {very_strong} pairs\n",
	        f"- 🟡 Strong Similarity (80-89%): {strong} pairs\n",
	        f"- 🟠 Good Similarity (70-79%): {good} pairs\n",
	        f"- 🔵 Somewhat Related (50-69%): {somewhat_related} pairs\n",
	        f"- ⚪ Slightly Related (30-49%): {slightly_related} pairs\n",
	        f"- ❌ Not Similar (0-29%): {not_similar} pairs (not shown)\n\n",
	    ])

	    # Categorise the high-scoring pairs by the first concept whose keyword
	    # appears (as a substring) in either sentence.
	    top_pairs = [pair for pair in all_pairs if pair[2] >= 0.7]
	    if top_pairs:
	        concepts = {
	            'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
	            'Education': ['education', 'learn', 'course', 'degree', 'academic'],
	            'Experience': ['experience', 'work', 'job', 'intern', 'position'],
	            'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
	            'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
	        }

	        concept_counts = dict.fromkeys(concepts, 0)
	        concept_counts['Other'] = 0

	        for sent1, sent2, _score in top_pairs:
	            label = next(
	                (
	                    concept
	                    for concept, keywords in concepts.items()
	                    if any(keyword in sent1.lower() for keyword in keywords)
	                    or any(keyword in sent2.lower() for keyword in keywords)
	                ),
	                'Other',
	            )
	            concept_counts[label] += 1

	        sections.append("Highly Similar Content by Category:\n")
	        sections.extend(
	            f"- {concept}: {count} pairs\n"
	            for concept, count in concept_counts.items()
	            if count > 0
	        )

	    return "".join(sections)

	def group_similar_concepts(all_pairs):
	    """Bucket ``(sentence1, sentence2, score)`` pairs by concept.

	    A pair is assigned to the first concept (in declaration order) that has a
	    keyword occurring as a substring of either lower-cased sentence; pairs
	    matching nothing go to ``'Other'``. Returns a ``defaultdict(list)`` keyed
	    by concept name, in order of first appearance.
	    """
	    keyword_map = {
	        'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
	        'Education': ['education', 'learn', 'course', 'degree', 'academic'],
	        'Experience': ['experience', 'work', 'job', 'intern', 'position'],
	        'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
	        'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
	    }

	    grouped = defaultdict(list)
	    for sent1, sent2, score in all_pairs:
	        label = next(
	            (
	                concept
	                for concept, keywords in keyword_map.items()
	                if any(keyword in sent1.lower() for keyword in keywords)
	                or any(keyword in sent2.lower() for keyword in keywords)
	            ),
	            'Other',
	        )
	        grouped[label].append((sent1, sent2, score))

	    return grouped

	def get_similarity_color(score):
	    """Map a similarity score to the hex colour of its band."""
	    # Ordered from the highest threshold down; the first one reached wins.
	    bands = (
	        (0.9, "#ff6666"),  # Red - Very Strong
	        (0.8, "#ffaa44"),  # Orange - Strong
	        (0.7, "#ffcc66"),  # Yellow - Good
	        (0.5, "#aaddff"),  # Blue - Somewhat Related
	    )
	    for threshold, color in bands:
	        if score >= threshold:
	            return color
	    return "#cccccc"  # Gray - Slightly Related

	def _uploaded_path(uploaded):
	    """Return the path of an upload given as a path string or an object with ``.name``."""
	    return getattr(uploaded, "name", uploaded)


	def _extract_document_text(path):
	    """Extract text by file extension (case-insensitive); ``None`` if unsupported."""
	    lowered = str(path).lower()
	    if lowered.endswith('.pdf'):
	        return extract_text_from_pdf(path)
	    if lowered.endswith('.docx'):
	        return extract_text_from_docx(path)
	    return None


	def _is_extraction_failure(text):
	    """Return True when ``text`` is one of the extractors' failure messages."""
	    # Match only the exact message prefixes the extractors produce, so a
	    # document that merely contains the word "Error" is not rejected.
	    return text.startswith((
	        "Error extracting PDF:",
	        "Error extracting DOCX:",
	        "PDF processing not available.",
	    ))


	def _render_pair_html(sent1, sent2, score):
	    """Render one matched sentence pair as an HTML card."""
	    from html import escape  # local import: keeps this block self-contained

	    color = get_similarity_color(score)
	    # Sentences come from uploaded documents, so escape them before
	    # embedding them in markup.
	    return (
	        f'<div style="background-color: #f9f9f9; padding: 12px; margin: 8px; '
	        f'border-radius: 8px; border-left: 5px solid {color};">'
	        f"<p><b>📄 Document 1:</b> {escape(sent1)}</p>"
	        f"<p><b>📄 Document 2:</b> {escape(sent2)}</p>"
	        f"<p><b>Similarity:</b> <span style='color: {color}; font-weight: bold;'>{score:.2%}</span></p>"
	        f"</div>"
	    )


	def similarity(file1, file2):
	    """Compare two uploaded documents and build the three UI outputs.

	    Args:
	        file1: First upload: a path string or an object whose ``.name`` is
	            the path. ``None`` means nothing was uploaded.
	        file2: Second upload, same conventions.

	    Returns:
	        ``(output_html, summary_text, barchart_image)``. On any failure the
	        first element is a plain error message and the other two are
	        ``None``; no exception is raised for missing uploads, unsupported
	        formats, or extraction problems.
	    """
	    if file1 is None or file2 is None:
	        return "Please upload both documents.", None, None

	    try:
	        text1 = _extract_document_text(_uploaded_path(file1))
	        if text1 is None:
	            return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None, None

	        text2 = _extract_document_text(_uploaded_path(file2))
	        if text2 is None:
	            return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None, None
	    except Exception as e:
	        return f"Error processing files: {str(e)}", None, None

	    # The extractors signal failure by returning a message instead of text;
	    # report those first, then fall back to a generic message for empty text.
	    failures = [
	        f"{label}: {text}"
	        for label, text in (("Document 1", text1), ("Document 2", text2))
	        if text and _is_extraction_failure(text)
	    ]
	    if failures:
	        return " ".join(failures), None, None
	    if not text1 or not text2:
	        return "Error extracting text from one or both documents.", None, None

	    overall_similarity, all_pairs = calculate_cosine_similarity(text1, text2)

	    # The detailed view shows only pairs scoring 70% or more, grouped by concept.
	    high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7]
	    concept_groups = group_similar_concepts(high_similarity_pairs)

	    html_parts = [
	        f"<h3>Overall Similarity Score: <span style='color: #4CAF50;'>{overall_similarity:.2%}</span></h3>"
	    ]
	    if high_similarity_pairs:
	        html_parts.append(
	            f"<h4>Found {len(high_similarity_pairs)} significant similar sentence pairs (70%+):</h4>"
	        )
	        for concept, pairs in concept_groups.items():
	            if pairs:
	                html_parts.append(f"<h5>🔍 {concept}:</h5>")
	                html_parts.extend(
	                    _render_pair_html(sent1, sent2, score) for sent1, sent2, score in pairs
	                )
	    else:
	        html_parts.append(
	            "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
	            "<p>⚠️ No significant similarities found above the 70% threshold.</p>"
	            "</div>"
	        )
	    output_html = "".join(html_parts)

	    # The chart and the summary cover ALL retained pairs, not just the 70%+ ones.
	    barchart_image = create_similarity_barchart(all_pairs)
	    summary_text = create_similarity_summary(overall_similarity, all_pairs)

	    return output_html, summary_text, barchart_image

	# Gradio UI: two file inputs and a compare button in the left column, the
	# summary and detailed matches in the right column, and the similarity
	# distribution chart with its colour legend underneath.
	with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
	    gr.Markdown("""
	# 📄 DocuTwin Finder
	Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
	""")

	    with gr.Row():
	        with gr.Column(scale=1):
	            gr.Markdown("### Upload Documents")
	            file1 = gr.File(label="Document 1", file_types=[".pdf", ".docx"])
	            file2 = gr.File(label="Document 2", file_types=[".pdf", ".docx"])
	            submit_btn = gr.Button("Compare Documents", variant="primary")

	        with gr.Column(scale=2):
	            gr.Markdown("### Analysis Results")
	            summary_output = gr.Markdown()
	            output_html = gr.HTML(label="Highly Similar Content (70%+)")

	    gr.Markdown("""
	### 📊 Complete Similarity Distribution

	Color Guide:
	- 🔴 Very Strong Similarity (90-100%) - Essentially identical content
	- 🟡 Strong Similarity (80-89%) - Very similar with minor differences
	- 🟠 Good Similarity (70-79%) - Related concepts with noticeable differences
	- 🔵 Somewhat Related (50-69%) - Shared concepts but different focus
	- ⚪ Slightly Related (30-49%) - Barely related topics
	- ❌ Not Similar (0-29%) - Completely different content (not shown)
	""")
	    barchart_display = gr.HTML()

	    def process_files(file1, file2):
	        """Run the comparison and wrap the chart data URI in an ``<img>`` tag."""
	        details_html, summary_md, chart_uri = similarity(file1, file2)

	        # No chart (error path or no pairs): show a placeholder paragraph.
	        if not chart_uri:
	            return details_html, summary_md, "<p>No similarity data available for visualization</p>"

	        chart_html = f'<img src="{chart_uri}" alt="Complete Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
	        return details_html, summary_md, chart_html

	    # Wire the button to the handler; output order matches the handler's tuple.
	    submit_btn.click(
	        fn=process_files,
	        inputs=[file1, file2],
	        outputs=[output_html, summary_output, barchart_display],
	    )

	# Start the server only when run as a script.
	if __name__ == "__main__":
	    demo.launch(server_name="0.0.0.0", server_port=7860)