import os import docx from sentence_transformers import SentenceTransformer, util import gradio as gr import re from typing import List, Tuple, Dict import matplotlib.pyplot as plt import numpy as np from collections import defaultdict import base64 from io import BytesIO # Try to import PyMuPDF with proper error handling pymupdf_available = False try: import pymupdf pymupdf_available = True print("PyMuPDF imported successfully") except ImportError: try: import fitz pymupdf_available = True print("fitz imported successfully") except ImportError: print("PyMuPDF/fitz is not available. PDF extraction will not work.") # Initialize the SentenceTransformer model model = SentenceTransformer('all-MiniLM-L6-v2') def extract_text_from_pdf(pdf_path): if not pymupdf_available: return "PDF processing not available. Please install PyMuPDF." try: if 'pymupdf' in globals(): doc = pymupdf.open(pdf_path) else: import fitz doc = fitz.open(pdf_path) text = "" for page in doc: text += page.get_text() return text except Exception as e: print(f"Error extracting text from PDF: {str(e)}") return f"Error extracting PDF: {str(e)}" def extract_text_from_docx(docx_path): try: doc = docx.Document(docx_path) text = "\n".join([para.text for para in doc.paragraphs]) return text except Exception as e: print(f"Error extracting text from DOCX: {str(e)}") return f"Error extracting DOCX: {str(e)}" def preprocess_text(text: str) -> List[str]: """Split text into sentences and clean them""" if not text or text.strip() == "": return [] # Split into sentences using regex sentences = re.split(r'(? 10] return sentences def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple[str, str, float]]]: """Calculate similarity score and return similar sentence pairs""" # Preprocess texts into sentences sentences1 = preprocess_text(doc1) sentences2 = preprocess_text(doc2) if not sentences1 or not sentences2: return 0.0, [] # Get embeddings for all sentences embeddings1 = model.encode(sentences1, convert_to_tensor=True) embeddings2 = model.encode(sentences2, convert_to_tensor=True) # Calculate cosine similarities between all sentence pairs cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2) similarity_matrix = cosine_similarities.cpu().numpy() # Find the most similar sentences (all pairs for comprehensive analysis) all_pairs = [] for i in range(len(sentences1)): for j in range(len(sentences2)): similarity_score = similarity_matrix[i][j] if similarity_score > 0.3: # Include even lower similarities for comprehensive analysis all_pairs.append((sentences1[i], sentences2[j], similarity_score)) # Sort by similarity score (highest first) all_pairs.sort(key=lambda x: x[2], reverse=True) # Calculate overall similarity max_similarities1 = np.max(similarity_matrix, axis=1) max_similarities2 = np.max(similarity_matrix, axis=0) mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0 overall_similarity = mean_similarity return overall_similarity, all_pairs def create_similarity_barchart(all_pairs): """Create a bar chart showing similarity distribution across all levels""" if not all_pairs: return None plt.figure(figsize=(14, 8)) # Extract similarity scores scores = [pair[2] for pair in all_pairs] # Create bins for all similarity levels bins = [0.3, 0.5, 0.7, 0.8, 0.9, 1.0] bin_labels = [ 'Slightly Related\n(30-49%)', 'Somewhat Related\n(50-69%)', 'Good Similarity\n(70-79%)', 'Strong Similarity\n(80-89%)', 'Very Strong Similarity\n(90-100%)' ] # Count pairs in each bin counts, _ = np.histogram(scores, bins=bins) # Create bar chart with colors for all levels colors = ['#cccccc', '#aaddff', '#ffcc66', '#ffaa44', '#ff6666'] bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.7) # Add value labels on bars for i, (count, bar) in enumerate(zip(counts, bars)): if count > 0: plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1, str(count), ha='center', va='bottom', fontsize=12, fontweight='bold') plt.xlabel('Similarity Level', fontsize=14, fontweight='bold') plt.ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold') plt.title('Complete Similarity Distribution Analysis', fontsize=16, fontweight='bold', pad=20) plt.xticks(range(len(bin_labels)), bin_labels, fontsize=11) # Remove top and right spines plt.gca().spines['top'].set_visible(False) plt.gca().spines['right'].set_visible(False) # Add grid for better readability plt.grid(axis='y', alpha=0.3) # Add explanation explanation_text = ( "This chart shows the complete range of similarity between all sentence pairs in your documents.\n" "Pairs with less than 30% similarity are not shown as they are considered not similar." ) plt.figtext(0.5, 0.01, explanation_text, ha="center", fontsize=11, style='italic', bbox={"facecolor":"#f0f0f0", "alpha":0.7, "pad":5}) buf = BytesIO() plt.savefig(buf, format='png', dpi=100, bbox_inches='tight') plt.close() buf.seek(0) return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}" def create_similarity_summary(overall_similarity, all_pairs): """Create a text summary of the similarity analysis""" summary = f"## 📊 Complete Similarity Analysis\n\n" summary += f"**Overall Similarity Score:** {overall_similarity:.2%}\n\n" if all_pairs: # Count pairs in each category very_strong = len([p for p in all_pairs if p[2] >= 0.9]) strong = len([p for p in all_pairs if 0.8 <= p[2] < 0.9]) good = len([p for p in all_pairs if 0.7 <= p[2] < 0.8]) somewhat_related = len([p for p in all_pairs if 0.5 <= p[2] < 0.7]) slightly_related = len([p for p in all_pairs if 0.3 <= p[2] < 0.5]) summary += "**Similarity Breakdown:**\n" summary += f"- 🔴 Very Strong Similarity (90-100%): {very_strong} pairs\n" summary += f"- 🟡 Strong Similarity (80-89%): {strong} pairs\n" summary += f"- 🟠 Good Similarity (70-79%): {good} pairs\n" summary += f"- 🔵 Somewhat Related (50-69%): {somewhat_related} pairs\n" summary += f"- ⚪ Slightly Related (30-49%): {slightly_related} pairs\n" summary += f"- ❌ Not Similar (0-29%): {len([p for p in all_pairs if p[2] < 0.3])} pairs (not shown)\n\n" # Most common concepts in higher similarity pairs high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7] if high_similarity_pairs: concepts = { 'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'], 'Education': ['education', 'learn', 'course', 'degree', 'academic'], 'Experience': ['experience', 'work', 'job', 'intern', 'position'], 'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'], 'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability'] } concept_counts = {concept: 0 for concept in concepts.keys()} concept_counts['Other'] = 0 for sent1, sent2, score in high_similarity_pairs: matched = False for concept, keywords in concepts.items(): if any(keyword in sent1.lower() for keyword in keywords) or \ any(keyword in sent2.lower() for keyword in keywords): concept_counts[concept] += 1 matched = True break if not matched: concept_counts['Other'] += 1 summary += "**Highly Similar Content by Category:**\n" for concept, count in concept_counts.items(): if count > 0: summary += f"- {concept}: {count} pairs\n" else: summary += "No significant similarities found above the 30% threshold.\n" return summary def group_similar_concepts(all_pairs): """Group similar sentences by concept using keyword extraction""" concept_groups = defaultdict(list) concepts = { 'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'], 'Education': ['education', 'learn', 'course', 'degree', 'academic'], 'Experience': ['experience', 'work', 'job', 'intern', 'position'], 'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'], 'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability'] } for sent1, sent2, score in all_pairs: matched_concept = 'Other' for concept, keywords in concepts.items(): if any(keyword in sent1.lower() for keyword in keywords) or \ any(keyword in sent2.lower() for keyword in keywords): matched_concept = concept break concept_groups[matched_concept].append((sent1, sent2, score)) return concept_groups def get_similarity_color(score): """Get color based on similarity score""" if score >= 0.9: return "#ff6666" # Red - Very Strong elif score >= 0.8: return "#ffaa44" # Orange - Strong elif score >= 0.7: return "#ffcc66" # Yellow - Good elif score >= 0.5: return "#aaddff" # Blue - Somewhat Related else: return "#cccccc" # Gray - Slightly Related def similarity(file1, file2): if file1 is None or file2 is None: return "Please upload both documents.", None, None try: if file1.name.endswith('.pdf'): text1 = extract_text_from_pdf(file1.name) elif file1.name.endswith('.docx'): text1 = extract_text_from_docx(file1.name) else: return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None, None if file2.name.endswith('.pdf'): text2 = extract_text_from_pdf(file2.name) elif file2.name.endswith('.docx'): text2 = extract_text_from_docx(file2.name) else: return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None, None except Exception as e: return f"Error processing files: {str(e)}", None, None if not text1 or not text2 or "Error" in text1 or "Error" in text2: error_msg = "" if "Error" in text1: error_msg += f"Document 1: {text1} " if "Error" in text2: error_msg += f"Document 2: {text2}" return error_msg if error_msg else "Error extracting text from one or both documents.", None, None overall_similarity, all_pairs = calculate_cosine_similarity(text1, text2) # Filter to show only higher similarity pairs in detailed view (70%+) high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7] concept_groups = group_similar_concepts(high_similarity_pairs) # Prepare detailed output output_html = f"
📄 Document 1: {sent1}
📄 Document 2: {sent2}
Similarity: {score:.2%}
⚠️ No significant similarities found above the 70% threshold.
" output_html += "No similarity data available for visualization
" if barchart_img: barchart_html = f'