Spaces:

syedmohaiminulhoque
/

agentic-doc-sim-streamlit

Running

App Files Files Community

agentic-doc-sim-streamlit / src /utils /visualization.py

syedmohaiminulhoque

feat: Implement Phase 2 enhancements for multi-modal document comparison

b73db8b 17 days ago

raw

history blame contribute delete

7.41 kB

	"""
	Visualization utilities for displaying similarity results.
	"""
	import difflib
	import html
	from typing import List, Dict, Any

	import plotly.express as px
	import plotly.graph_objects as go

	from models.similarity import SimilarityReport, ModalityScore


	def create_similarity_gauge(score: float, title: str = "Overall Similarity") -> go.Figure:
	    """
	    Create a gauge chart showing similarity score.

	    The score is shown as a percentage on a 0-100 gauge. The bar is green
	    from 70%, orange from 40% and red below that; the background bands use
	    the same cut-offs. A fixed marker line is drawn at 90%.

	    Args:
	        score: Similarity score (0.0 to 1.0). Values outside that range are
	            clamped so the bar can never run past the gauge axis.
	        title: Chart title

	    Returns:
	        Plotly figure
	    """
	    # Clamp first: a score that drifts slightly outside [0, 1] would
	    # otherwise be drawn beyond the 0-100 axis.
	    score = min(max(score, 0.0), 1.0)

	    # Determine bar color based on score
	    if score >= 0.7:
	        color = "green"
	    elif score >= 0.4:
	        color = "orange"
	    else:
	        color = "red"

	    fig = go.Figure(go.Indicator(
	        # No "delta" here: without a configured delta reference the indicator
	        # compares the value against itself and always displays a zero change.
	        mode="gauge+number",
	        value=score * 100,  # Convert to percentage
	        domain={'x': [0, 1], 'y': [0, 1]},
	        title={'text': title, 'font': {'size': 24}},
	        number={'suffix': "%", 'font': {'size': 40}},
	        gauge={
	            'axis': {'range': [0, 100], 'tickwidth': 1, 'tickcolor': "darkblue"},
	            'bar': {'color': color},
	            'bgcolor': "white",
	            'borderwidth': 2,
	            'bordercolor': "gray",
	            # Background bands use the same cut-offs as the bar color above
	            'steps': [
	                {'range': [0, 40], 'color': '#ffcccc'},
	                {'range': [40, 70], 'color': '#fff4cc'},
	                {'range': [70, 100], 'color': '#ccffcc'}
	            ],
	            'threshold': {
	                'line': {'color': "red", 'width': 4},
	                'thickness': 0.75,
	                'value': 90
	            }
	        }
	    ))

	    fig.update_layout(
	        height=300,
	        margin=dict(l=20, r=20, t=60, b=20)
	    )

	    return fig


	def create_modality_breakdown_chart(report: SimilarityReport) -> go.Figure:
	    """
	    Build a grouped bar chart of per-modality similarity scores and weights.

	    For every modality whose ``<name>_score`` attribute on the report is
	    truthy, two bars are drawn: the score and the weight looked up in
	    ``report.weights_used`` (0 when the key is absent), both scaled to
	    percentages.

	    Args:
	        report: SimilarityReport object

	    Returns:
	        Plotly figure
	    """
	    # (axis label, key) - the key names both the "<key>_score" attribute
	    # on the report and the entry in report.weights_used.
	    modality_table = (
	        ("Text", "text"),
	        ("Table", "table"),
	        ("Image", "image"),
	        ("Layout", "layout"),
	        ("Metadata", "metadata"),
	    )

	    modalities, scores, weights = [], [], []
	    for label, key in modality_table:
	        modality_score = getattr(report, f"{key}_score")
	        if not modality_score:
	            # Modality not available in this report
	            continue
	        modalities.append(label)
	        scores.append(modality_score.score * 100)
	        weights.append(report.weights_used.get(key, 0) * 100)

	    score_labels = [f"{s:.1f}%" for s in scores]
	    weight_labels = [f"{w:.0f}%" for w in weights]

	    fig = go.Figure()
	    fig.add_trace(
	        go.Bar(
	            name='Similarity Score',
	            x=modalities,
	            y=scores,
	            marker_color='lightblue',
	            text=score_labels,
	            textposition='auto',
	        )
	    )
	    fig.add_trace(
	        go.Bar(
	            name='Weight',
	            x=modalities,
	            y=weights,
	            marker_color='lightcoral',
	            text=weight_labels,
	            textposition='auto',
	        )
	    )

	    # Scores and weights share one 0-100 percentage axis, side by side
	    fig.update_layout(
	        title="Per-Modality Similarity Breakdown",
	        xaxis_title="Modality",
	        yaxis_title="Percentage (%)",
	        yaxis_range=[0, 100],
	        barmode='group',
	        height=400,
	        showlegend=True,
	    )
	    return fig


	def format_matched_sections(matched_sections: List[Dict[str, Any]]) -> str:
	    """
	    Render matched sections as a single display string.

	    Each match gets a header line with its position, upper-cased type and
	    similarity percentage, followed by type-specific details ("text",
	    "table", "image" or "metadata"; any other type gets the header only)
	    and a "---" separator.

	    Args:
	        matched_sections: List of matched section dictionaries

	    Returns:
	        Formatted string, or a fixed notice when the list is empty
	    """
	    if not matched_sections:
	        return "No matched sections found."

	    lines = []
	    for number, match in enumerate(matched_sections, start=1):
	        kind = match.get("type", "unknown")
	        score = match.get("similarity", 0.0)

	        lines += [f"Match {number} ({kind.upper()}) - Similarity: {score:.2%}", ""]

	        if kind == "text":
	            lines += [
	                f"📄 Doc 1 (Page {match.get('doc1_page', '?')}):",
	                f"```\n{match.get('doc1_content', '')}\n```",
	                "",
	                f"📄 Doc 2 (Page {match.get('doc2_page', '?')}):",
	                f"```\n{match.get('doc2_content', '')}\n```",
	            ]
	        elif kind == "table":
	            lines += [
	                f"📊 Doc 1 Table (Page {match.get('doc1_page', '?')}):",
	                f"_{match.get('doc1_schema', '')}_",
	                "",
	                f"📊 Doc 2 Table (Page {match.get('doc2_page', '?')}):",
	                f"_{match.get('doc2_schema', '')}_",
	            ]
	        elif kind == "image":
	            lines += [
	                f"🖼️ Doc 1 Image (Page {match.get('doc1_page', '?')}):",
	                f"_Image ID: {match.get('doc1_image_id', 'N/A')}_",
	                "",
	                f"🖼️ Doc 2 Image (Page {match.get('doc2_page', '?')}):",
	                f"_Image ID: {match.get('doc2_image_id', 'N/A')}_",
	            ]
	        elif kind == "metadata":
	            lines += [
	                f"📋 Field: {match.get('field', 'unknown').title()}",
	                f"- Doc 1: {match.get('doc1_value', 'N/A')}",
	                f"- Doc 2: {match.get('doc2_value', 'N/A')}",
	            ]

	        # Separator after every match, including unrecognised types
	        lines += ["", "---", ""]

	    return "\n".join(lines)


	def generate_diff_html(text1: str, text2: str) -> str:
	    """
	    Generate HTML diff highlighting differences between two texts.

	    Both texts are split on whitespace and compared word by word. Unchanged
	    words are emitted as plain spans, words only in ``text1`` get a red
	    strike-through background and words only in ``text2`` a green one.
	    Every word is HTML-escaped before being embedded, so markup characters
	    in the document text are displayed literally instead of being
	    interpreted as tags.

	    Args:
	        text1: First text
	        text2: Second text

	    Returns:
	        HTML string with diff highlighting
	    """
	    # Inline CSS per ndiff prefix; None means an unstyled span.
	    span_styles = {
	        "  ": None,  # Unchanged
	        "- ": "background-color: #ffcccc; text-decoration: line-through;",  # Removed from text1
	        "+ ": "background-color: #ccffcc;",  # Added in text2
	    }

	    html_parts = ['<div style="font-family: monospace; line-height: 1.5;">']

	    # Split into words for a finer-grained diff than whole lines
	    for item in difflib.ndiff(text1.split(), text2.split()):
	        prefix = item[:2]
	        if prefix not in span_styles:
	            # "? " lines are ndiff's intra-item hints, not words
	            continue

	        # Escape so "<", ">" and "&" in the documents cannot break or
	        # inject markup into the rendered diff.
	        word = html.escape(item[2:])
	        style = span_styles[prefix]
	        if style is None:
	            html_parts.append(f'<span>{word} </span>')
	        else:
	            html_parts.append(f'<span style="{style}">{word} </span>')

	    html_parts.append('</div>')

	    return ''.join(html_parts)


	def create_score_legend() -> str:
	    """
	    Return the guide that explains what each similarity score band means.

	    Returns:
	        Markdown formatted legend
	    """
	    return """
	### 📊 Similarity Score Guide

	- 90-100%: Nearly identical documents
	- 70-89%: Very similar with minor differences
	- 40-69%: Moderately similar with notable differences
	- 0-39%: Significantly different documents
	"""