File size: 15,616 Bytes
d574894 6982985 d574894 2f1cc59 d574894 2f1cc59 9ddc9f6 d574894 2f1cc59 d574894 9ddc9f6 d574894 9ddc9f6 d574894 2f1cc59 9ddc9f6 d574894 2f1cc59 d574894 2f1cc59 9ddc9f6 d574894 2f1cc59 9ddc9f6 2f1cc59 d574894 2f1cc59 9ddc9f6 092f11f 9ddc9f6 2f1cc59 d574894 714e663 2f1cc59 28ea54b 2f1cc59 28ea54b 2f1cc59 714e663 2f1cc59 28ea54b d574894 28ea54b 092f11f 6982985 28ea54b 6982985 092f11f 28ea54b 714e663 092f11f 714e663 28ea54b 714e663 28ea54b 092f11f 28ea54b 714e663 092f11f 28ea54b 6982985 9ddc9f6 d574894 9ddc9f6 6982985 092f11f 6982985 28ea54b 6982985 28ea54b 714e663 6982985 28ea54b 6982985 28ea54b 6982985 28ea54b 6982985 28ea54b 9ddc9f6 6982985 2f1cc59 28ea54b 2f1cc59 092f11f 2f1cc59 28ea54b 092f11f 2f1cc59 d574894 28ea54b 2f1cc59 9ddc9f6 092f11f 9ddc9f6 092f11f 9ddc9f6 092f11f 9ddc9f6 092f11f 9ddc9f6 092f11f d574894 28ea54b 2f1cc59 28ea54b 2f1cc59 6982985 2f1cc59 28ea54b 2f1cc59 6982985 092f11f 2f1cc59 28ea54b 2f1cc59 6982985 2f1cc59 6982985 28ea54b 6982985 2f1cc59 28ea54b 6982985 092f11f d574894 9ddc9f6 2f1cc59 072f334 2f1cc59 d574894 9ddc9f6 6982985 28ea54b 6982985 092f11f 28ea54b 092f11f 28ea54b 092f11f 9ddc9f6 092f11f 9ddc9f6 092f11f 28ea54b 6982985 092f11f 9ddc9f6 2f1cc59 092f11f 2f1cc59 9ddc9f6 2f1cc59 6982985 714e663 6982985 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 |
import os
import docx
from sentence_transformers import SentenceTransformer, util
import gradio as gr
import re
from typing import List, Tuple, Dict
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
import base64
from io import BytesIO
# Probe for PyMuPDF at import time: newer releases expose the package as
# `pymupdf`, older ones as `fitz`. The flag gates PDF extraction below so the
# rest of the app still works (DOCX-only) when neither module is installed.
pymupdf_available = False
try:
    import pymupdf
    pymupdf_available = True
    print("PyMuPDF imported successfully")
except ImportError:
    try:
        import fitz
        pymupdf_available = True
        print("fitz imported successfully")
    except ImportError:
        print("PyMuPDF/fitz is not available. PDF extraction will not work.")
# Sentence-embedding model used for all similarity scoring
# (downloaded/cached by sentence-transformers on first use).
model = SentenceTransformer('all-MiniLM-L6-v2')
def extract_text_from_pdf(pdf_path):
    """Extract all page text from a PDF file.

    Args:
        pdf_path: Filesystem path to the PDF.

    Returns:
        The concatenated text of every page, or a human-readable error
        string (errors are returned, not raised, because callers check
        for an "Error" substring in the result).
    """
    if not pymupdf_available:
        return "PDF processing not available. Please install PyMuPDF."
    try:
        # Use whichever module name was successfully imported at startup.
        if 'pymupdf' in globals():
            doc = pymupdf.open(pdf_path)
        else:
            import fitz
            doc = fitz.open(pdf_path)
        try:
            return "".join(page.get_text() for page in doc)
        finally:
            # Bug fix: the document handle was previously never closed,
            # leaking the underlying file descriptor on every call.
            doc.close()
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
        return f"Error extracting PDF: {str(e)}"
def extract_text_from_docx(docx_path):
    """Return the text of a DOCX file, one paragraph per line.

    On failure the error is printed and returned as a string so callers
    can surface it (they look for an "Error" substring).
    """
    try:
        paragraphs = docx.Document(docx_path).paragraphs
        return "\n".join(para.text for para in paragraphs)
    except Exception as e:
        print(f"Error extracting text from DOCX: {str(e)}")
        return f"Error extracting DOCX: {str(e)}"
def preprocess_text(text: str) -> List[str]:
    """Break raw document text into cleaned sentence strings.

    Sentences shorter than 11 characters after stripping are discarded
    as noise (headings, stray fragments).
    """
    if not text or not text.strip():
        return []
    # Sentence boundary: '.', '?' or '!' followed by whitespace; the negative
    # lookbehinds avoid splitting inside abbreviations like "e.g." or "Dr.".
    fragments = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
    return [frag.strip() for frag in fragments if len(frag.strip()) > 10]
def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple[str, str, float]]]:
    """Score two documents for semantic overlap at sentence granularity.

    Returns:
        (overall_score, pairs) where pairs holds every cross-document
        sentence pair scoring above 0.3, sorted by score descending.
    """
    sents_a = preprocess_text(doc1)
    sents_b = preprocess_text(doc2)
    if not sents_a or not sents_b:
        return 0.0, []
    # Embed each sentence list once; compare all pairs in one matrix op.
    emb_a = model.encode(sents_a, convert_to_tensor=True)
    emb_b = model.encode(sents_b, convert_to_tensor=True)
    sim_matrix = util.pytorch_cos_sim(emb_a, emb_b).cpu().numpy()
    # Keep every pair above the 0.3 floor so downstream charts can show
    # the full distribution, not just strong matches.
    pairs = [
        (sents_a[i], sents_b[j], sim_matrix[i][j])
        for i in range(len(sents_a))
        for j in range(len(sents_b))
        if sim_matrix[i][j] > 0.3
    ]
    pairs.sort(key=lambda item: item[2], reverse=True)
    # Overall score: best-match similarity per sentence, averaged over both
    # directions so neither document dominates.
    best_a = np.max(sim_matrix, axis=1)
    best_b = np.max(sim_matrix, axis=0)
    overall = (np.mean(best_a) + np.mean(best_b)) / 2.0
    return overall, pairs
def create_similarity_barchart(all_pairs):
    """Render the similarity distribution of all sentence pairs as a PNG.

    Args:
        all_pairs: List of (sentence1, sentence2, score) tuples; scores are
            expected to be > 0.3 (the caller's inclusion floor).

    Returns:
        A "data:image/png;base64,..." URI for the rendered chart, or None
        when there are no pairs to plot.
    """
    if not all_pairs:
        return None
    plt.figure(figsize=(14, 8))
    # Only the score component is needed for the histogram.
    scores = [pair[2] for pair in all_pairs]
    # Bin edges and labels mirror the colour legend shown in the UI.
    bins = [0.3, 0.5, 0.7, 0.8, 0.9, 1.0]
    bin_labels = [
        'Slightly Related\n(30-49%)',
        'Somewhat Related\n(50-69%)',
        'Good Similarity\n(70-79%)',
        'Strong Similarity\n(80-89%)',
        'Very Strong Similarity\n(90-100%)'
    ]
    counts, _ = np.histogram(scores, bins=bins)
    # Gray -> red gradient tracking increasing similarity strength.
    colors = ['#cccccc', '#aaddff', '#ffcc66', '#ffaa44', '#ff6666']
    bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.7)
    # Annotate each non-empty bar with its pair count.
    # (Fix: dropped the unused enumerate() index from the original loop.)
    for count, bar in zip(counts, bars):
        if count > 0:
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                     str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')
    plt.xlabel('Similarity Level', fontsize=14, fontweight='bold')
    plt.ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold')
    plt.title('Complete Similarity Distribution Analysis', fontsize=16, fontweight='bold', pad=20)
    plt.xticks(range(len(bin_labels)), bin_labels, fontsize=11)
    # Declutter: hide top/right spines, add a light horizontal grid.
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.grid(axis='y', alpha=0.3)
    explanation_text = (
        "This chart shows the complete range of similarity between all sentence pairs in your documents.\n"
        "Pairs with less than 30% similarity are not shown as they are considered not similar."
    )
    plt.figtext(0.5, 0.01, explanation_text, ha="center", fontsize=11, style='italic',
                bbox={"facecolor":"#f0f0f0", "alpha":0.7, "pad":5})
    # Serialize the figure to an in-memory PNG, then close it to free the
    # matplotlib figure state.
    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
    plt.close()
    buf.seek(0)
    return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
def create_similarity_summary(overall_similarity, all_pairs):
    """Build the markdown summary panel for the comparison results.

    Args:
        overall_similarity: Document-level score in [0, 1].
        all_pairs: (sentence1, sentence2, score) tuples above the 0.3 floor.

    Returns:
        A markdown string with the overall score, a per-band breakdown,
        and a keyword-based topic tally of the strongest pairs.
    """
    parts = ["## π Complete Similarity Analysis\n\n"]
    parts.append(
        f"**Overall Similarity Score:** <span style='color: #4CAF50; "
        f"font-size: 20px;'>{overall_similarity:.2%}</span>\n\n"
    )
    if not all_pairs:
        parts.append("No significant similarities found above the 30% threshold.\n")
        return "".join(parts)
    # Tally how many pairs fall into each similarity band.
    very_strong = sum(1 for p in all_pairs if p[2] >= 0.9)
    strong = sum(1 for p in all_pairs if 0.8 <= p[2] < 0.9)
    good = sum(1 for p in all_pairs if 0.7 <= p[2] < 0.8)
    somewhat_related = sum(1 for p in all_pairs if 0.5 <= p[2] < 0.7)
    slightly_related = sum(1 for p in all_pairs if 0.3 <= p[2] < 0.5)
    not_similar = sum(1 for p in all_pairs if p[2] < 0.3)
    parts.append("**Similarity Breakdown:**\n")
    parts.append(f"- π΄ Very Strong Similarity (90-100%): {very_strong} pairs\n")
    parts.append(f"- π‘ Strong Similarity (80-89%): {strong} pairs\n")
    parts.append(f"- π Good Similarity (70-79%): {good} pairs\n")
    parts.append(f"- π΅ Somewhat Related (50-69%): {somewhat_related} pairs\n")
    parts.append(f"- βͺ Slightly Related (30-49%): {slightly_related} pairs\n")
    parts.append(f"- β Not Similar (0-29%): {not_similar} pairs (not shown)\n\n")
    # Categorize only confident matches (70%+) by simple keyword spotting.
    high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7]
    if high_similarity_pairs:
        concepts = {
            'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
            'Education': ['education', 'learn', 'course', 'degree', 'academic'],
            'Experience': ['experience', 'work', 'job', 'intern', 'position'],
            'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
            'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
        }
        concept_counts = dict.fromkeys(concepts, 0)
        concept_counts['Other'] = 0
        for sent1, sent2, score in high_similarity_pairs:
            lowered1, lowered2 = sent1.lower(), sent2.lower()
            for concept, keywords in concepts.items():
                if any(kw in lowered1 or kw in lowered2 for kw in keywords):
                    concept_counts[concept] += 1
                    break
            else:
                # No concept keyword matched either sentence.
                concept_counts['Other'] += 1
        parts.append("**Highly Similar Content by Category:**\n")
        for concept, count in concept_counts.items():
            if count > 0:
                parts.append(f"- {concept}: {count} pairs\n")
    return "".join(parts)
def group_similar_concepts(all_pairs):
    """Bucket sentence pairs under a coarse topic label via keyword spotting.

    A pair lands in the first concept whose keyword appears in either
    sentence (case-insensitive); unmatched pairs go to 'Other'.

    Returns:
        defaultdict(list) mapping concept label -> list of pairs.
    """
    keyword_map = {
        'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
        'Education': ['education', 'learn', 'course', 'degree', 'academic'],
        'Experience': ['experience', 'work', 'job', 'intern', 'position'],
        'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
        'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
    }
    grouped = defaultdict(list)
    for sent1, sent2, score in all_pairs:
        lowered1, lowered2 = sent1.lower(), sent2.lower()
        for label, keywords in keyword_map.items():
            if any(kw in lowered1 or kw in lowered2 for kw in keywords):
                grouped[label].append((sent1, sent2, score))
                break
        else:
            grouped['Other'].append((sent1, sent2, score))
    return grouped
def get_similarity_color(score):
    """Map a similarity score onto the legend colour for its band."""
    # (threshold, colour) from strongest band down; first match wins.
    bands = (
        (0.9, "#ff6666"),  # red - very strong
        (0.8, "#ffaa44"),  # orange - strong
        (0.7, "#ffcc66"),  # yellow - good
        (0.5, "#aaddff"),  # blue - somewhat related
    )
    for threshold, color in bands:
        if score >= threshold:
            return color
    return "#cccccc"  # gray - slightly related
def similarity(file1, file2):
    """Compare two uploaded documents and build the three UI outputs.

    Args:
        file1, file2: Gradio file objects (or None) exposing a `.name` path.

    Returns:
        (detail_html, summary_markdown, barchart_data_uri). On validation
        or extraction failure the first element carries the error message
        and the other two are None.
    """
    if file1 is None or file2 is None:
        return "Please upload both documents.", None, None
    try:
        # Bug fix: extension checks are now case-insensitive, so uploads
        # like "Report.PDF" or "Essay.DOCX" are no longer rejected.
        name1 = file1.name.lower()
        if name1.endswith('.pdf'):
            text1 = extract_text_from_pdf(file1.name)
        elif name1.endswith('.docx'):
            text1 = extract_text_from_docx(file1.name)
        else:
            return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None, None
        name2 = file2.name.lower()
        if name2.endswith('.pdf'):
            text2 = extract_text_from_pdf(file2.name)
        elif name2.endswith('.docx'):
            text2 = extract_text_from_docx(file2.name)
        else:
            return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None, None
    except Exception as e:
        return f"Error processing files: {str(e)}", None, None
    # NOTE(review): extraction failure is signalled by an "Error" substring
    # in the returned text (see the extractors above); a document that merely
    # contains the word "Error" would false-positive here.
    if not text1 or not text2 or "Error" in text1 or "Error" in text2:
        error_msg = ""
        if "Error" in text1:
            error_msg += f"Document 1: {text1} "
        if "Error" in text2:
            error_msg += f"Document 2: {text2}"
        return error_msg if error_msg else "Error extracting text from one or both documents.", None, None
    overall_similarity, all_pairs = calculate_cosine_similarity(text1, text2)
    # The detailed listing only shows confident matches (70%+), grouped by topic.
    high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7]
    concept_groups = group_similar_concepts(high_similarity_pairs)
    output_html = f"<h3>Overall Similarity Score: <span style='color: #4CAF50;'>{overall_similarity:.2%}</span></h3>"
    if high_similarity_pairs:
        output_html += f"<h4>Found {len(high_similarity_pairs)} significant similar sentence pairs (70%+):</h4>"
        for concept, pairs in concept_groups.items():
            if pairs:
                output_html += f"<h5>π {concept}:</h5>"
                # (Fix: dropped the unused enumerate() index from the original loop.)
                for sent1, sent2, score in pairs:
                    color = get_similarity_color(score)
                    output_html += f"""
                    <div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
                        <p><b>π Document 1:</b> {sent1}</p>
                        <p><b>π Document 2:</b> {sent2}</p>
                        <p><b>Similarity:</b> <span style='color: {color}; font-weight: bold;'>{score:.2%}</span></p>
                    </div>
                    """
    else:
        output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
        output_html += "<p>β οΈ No significant similarities found above the 70% threshold.</p>"
        output_html += "</div>"
    # The chart intentionally covers ALL similarity levels, not just 70%+.
    barchart_image = create_similarity_barchart(all_pairs)
    summary_text = create_similarity_summary(overall_similarity, all_pairs)
    return output_html, summary_text, barchart_image
# Build the Gradio UI: uploads on the left, analysis results on the right.
with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # π DocuTwin Finder
    Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
    """)
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Upload Documents")
            file1 = gr.File(label="Document 1", file_types=[".pdf", ".docx"])
            file2 = gr.File(label="Document 2", file_types=[".pdf", ".docx"])
            submit_btn = gr.Button("Compare Documents", variant="primary")
        with gr.Column(scale=2):
            gr.Markdown("### Analysis Results")
            summary_output = gr.Markdown()
            output_html = gr.HTML(label="Highly Similar Content (70%+)")
            gr.Markdown("""
            ### π Complete Similarity Distribution
            **Color Guide:**
            - π΄ Very Strong Similarity (90-100%) - Essentially identical content
            - π‘ Strong Similarity (80-89%) - Very similar with minor differences
            - π Good Similarity (70-79%) - Related concepts with noticeable differences
            - π΅ Somewhat Related (50-69%) - Shared concepts but different focus
            - βͺ Slightly Related (30-49%) - Barely related topics
            - β Not Similar (0-29%) - Completely different content (not shown)
            """)
            barchart_display = gr.HTML()
    # Adapter between similarity() and the three output widgets: wraps the
    # chart's data URI in an <img> tag, or shows a placeholder when absent.
    def process_files(file1, file2):
        result_html, summary_text, barchart_img = similarity(file1, file2)
        barchart_html = "<p>No similarity data available for visualization</p>"
        if barchart_img:
            barchart_html = f'<img src="{barchart_img}" alt="Complete Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
        return result_html, summary_text, barchart_html
    # Wire the button to the adapter function.
    submit_btn.click(
        fn=process_files,
        inputs=[file1, file2],
        outputs=[output_html, summary_output, barchart_display]
    )
# Launch the application
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 (the usual setup for container /
    # Hugging Face Spaces deployment).
    demo.launch(server_name="0.0.0.0", server_port=7860)
|