Spaces:

uumerrr684
/

Cosine_Similarity_Explainer

Runtime error

App Files Files Community

Cosine_Similarity_Explainer / app.py

uumerrr684

Update app.py

06d8f74 verified 9 months ago

raw

history blame contribute delete

14.4 kB

	import html
	import json
	import os

	import numpy as np
	import requests
	import streamlit as st
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import cosine_similarity

	# Page-level settings: browser tab title, icon and a wide layout.
	# Kept ahead of every other st.* call in the script.
	st.set_page_config(page_title="Semantic Similarity Explainer", page_icon="🔍", layout="wide")

	# Heading plus a short blurb describing what the app does.
	intro_text = """
	This app calculates the semantic similarity between two sentences using transformer-based embeddings (all-MiniLM-L6-v2) and uses AI to explain why that specific score makes sense.
	"""
	st.title("🔍 Semantic Similarity Explainer with AI")
	st.markdown(intro_text)

	# Create the per-session list of past calculations the first time the
	# script runs in a session; later reruns leave the existing list alone.
	if "history" not in st.session_state:
	    st.session_state["history"] = []

	# Wrapped in st.cache_resource so the model object is reused by later calls
	# instead of being constructed again.
	@st.cache_resource
	def load_model():
	    """Return the all-MiniLM-L6-v2 SentenceTransformer used for embeddings."""
	    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
	    return SentenceTransformer(model_name)

	# Fetch the embedding model, showing a spinner while load_model() runs.
	with st.spinner("Loading transformer model..."):
	    model = load_model()

	# The OpenRouter key is read from the environment first (e.g. a Hugging Face
	# Spaces secret); the sidebar offers a manual fallback when it is missing.
	api_key = os.environ.get("OPENROUTER_API_KEY")

	# Static sidebar copy, defined up front so the layout code below stays short.
	how_it_works_md = """
	### How it works:
	1. Enter two sentences
	2. Generate embeddings using transformer
	3. Calculate cosine similarity
	4. AI explains the similarity score
	5. View the full prompt sent to AI
	"""

	model_info_text = """
	Model: all-MiniLM-L6-v2

	This transformer model creates 384-dimensional embeddings that capture semantic meaning, not just word overlap.
	"""

	# Sidebar: connection status (or key prompt) followed by the help text.
	with st.sidebar:
	    st.header("⚙️ Configuration")

	    if not api_key:
	        # No key in the environment: ask the user for one, masked as a password.
	        st.error("❌ API Key not found in environment")
	        api_key = st.text_input("OpenRouter API Key", type="password", help="Get your API key from https://openrouter.ai/keys")
	        st.info("💡 Tip: Set OPENROUTER_API_KEY in Hugging Face Spaces secrets to avoid typing it every time")
	    else:
	        st.success("🟢 API Connected")

	    st.markdown("---")
	    st.markdown(how_it_works_md)

	    st.info(model_info_text)

	# Two side-by-side text boxes, one per sentence to compare.
	first_col, second_col = st.columns(2)
	sentence1 = first_col.text_input("Enter first sentence:", placeholder="e.g., you are hot")
	sentence2 = second_col.text_input("Enter second sentence:", placeholder="e.g., you are cold")

	# Calculate button: embeds both sentences, scores them with cosine similarity,
	# asks the OpenRouter chat API to explain the score, then renders the
	# explanation, the exact prompt, and the numeric details in three tabs.
	if st.button("🎯 Calculate & Explain", type="primary"):
	    # Both sentences and an API key are required before any work is done.
	    if not sentence1 or not sentence2:
	        st.error("Please enter both sentences!")
	    elif not api_key:
	        st.error("Please enter your OpenRouter API key in the sidebar!")
	    else:
	        try:
	            # Normalize to lowercase for consistency
	            sentence1_normalized = sentence1.lower().strip()
	            sentence2_normalized = sentence2.lower().strip()

	            # Generate embeddings; each row is reshaped to (1, dim) because
	            # cosine_similarity expects 2-D inputs.
	            with st.spinner("Generating semantic embeddings..."):
	                embeddings = model.encode([sentence1_normalized, sentence2_normalized])
	                embedding1 = embeddings[0].reshape(1, -1)
	                embedding2 = embeddings[1].reshape(1, -1)

	            # Calculate cosine similarity (single value out of the 1x1 result)
	            similarity = cosine_similarity(embedding1, embedding2)[0][0]

	            # Convert the NumPy scalar to a built-in float before handing it
	            # to Streamlit widgets and string formatting.
	            similarity_float = float(similarity)
	            similarity_rounded = round(similarity_float, 2)

	            # Display similarity score
	            st.success("Semantic similarity between:")
	            st.info(f'"{sentence1}" and "{sentence2}" → {similarity_rounded:.2f}')

	            # Label the score band shown next to the meter
	            if similarity_rounded < 0.3:
	                similarity_desc = "Low similarity"
	            elif similarity_rounded < 0.7:
	                similarity_desc = "Moderate similarity"
	            else:
	                similarity_desc = "High similarity"

	            # Cosine similarity lies in [-1, 1], but st.progress only accepts
	            # floats in [0.0, 1.0]; clamp so a negative score draws an empty
	            # bar instead of raising and aborting the whole analysis.
	            progress_value = min(max(similarity_rounded, 0.0), 1.0)
	            st.progress(progress_value, text=similarity_desc)

	            # Create a comprehensive prompt for the AI to explain WHY this specific score occurred
	            detailed_prompt = f"""You are an expert in Natural Language Processing and semantic similarity analysis using transformer-based embeddings.

	I have calculated the semantic similarity between two sentences using the 'all-MiniLM-L6-v2' transformer model, which creates 384-dimensional vector embeddings that capture deep semantic meaning.

	ANALYSIS REQUEST:
	Sentence 1: "{sentence1}"
	Sentence 2: "{sentence2}"
	Cosine Similarity Score: {similarity_rounded:.2f}

	Please provide a detailed explanation of WHY these two specific sentences resulted in a similarity score of {similarity_rounded:.2f}.

	Your analysis should cover:

	1. Score Interpretation: What does {similarity_rounded:.2f} mean on the 0.00-1.00 scale? Is this low, moderate, or high similarity?

	2. Semantic Analysis:
	- What are the key semantic elements in each sentence?
	- What similarities did the transformer model detect?
	- What differences contributed to the score not being higher/lower?

	3. Linguistic Features:
	- Sentence structure patterns
	- Word relationships (synonyms, antonyms, related concepts)
	- Grammatical similarities
	- Contextual meaning

	4. Transformer Model Behavior:
	- How does all-MiniLM-L6-v2 process these sentences?
	- What semantic features likely contributed most to this score?
	- Why this score makes sense from a deep learning perspective

	5. Intuitive Validation: Does this {similarity_rounded:.2f} score match what a human would expect when comparing these sentences?

	Please be specific about these exact sentences and this exact score of {similarity_rounded:.2f}. Explain the reasoning behind this particular numerical result."""

	            # Call OpenRouter API with the detailed prompt
	            with st.spinner("🤖 AI is analyzing why you got this specific similarity score..."):
	                headers = {
	                    "Authorization": f"Bearer {api_key}",
	                    "Content-Type": "application/json",
	                    "HTTP-Referer": "https://github.com/semantic-similarity-app",
	                    "X-Title": "Semantic Similarity Explainer",
	                }

	                data = {
	                    "model": "openai/gpt-3.5-turbo",
	                    "messages": [
	                        {
	                            "role": "system",
	                            "content": "You are an NLP expert who explains similarity scores in simple, short terms that anyone can understand.",
	                        },
	                        {
	                            "role": "user",
	                            "content": detailed_prompt,
	                        },
	                    ],
	                    "temperature": 0.10,  # low temperature keeps the explanation focused
	                    "max_tokens": 400,  # caps the length of the explanation
	                }

	                # Without a timeout a stalled connection would block this
	                # script run indefinitely; a timeout surfaces it as an error.
	                response = requests.post(
	                    "https://openrouter.ai/api/v1/chat/completions",
	                    headers=headers,
	                    json=data,
	                    timeout=60,
	                )

	            if response.status_code == 200:
	                result = response.json()
	                explanation = result['choices'][0]['message']['content']

	                # Display results in tabs
	                tab1, tab2, tab3 = st.tabs(["🤖 AI Explanation", "📝 Prompt Sent to AI", "🔧 Technical Details"])

	                with tab1:
	                    st.markdown("### 🧠 Why You Got This Similarity Score")
	                    st.markdown("AI Analysis:")

	                    # The explanation is model output placed inside raw HTML,
	                    # so escape <, > and & to stop it injecting markup. The
	                    # HTML is assembled without leading indentation so that
	                    # Markdown cannot read it as an indented code block.
	                    safe_explanation = html.escape(str(explanation), quote=False)
	                    explanation_html = (
	                        '<div style="background-color: #f0f2f6; padding: 20px; border-radius: 10px; border-left: 4px solid #1f77b4;">\n'
	                        f"{safe_explanation}\n"
	                        "</div>"
	                    )
	                    with st.container():
	                        st.markdown(explanation_html, unsafe_allow_html=True)

	                with tab2:
	                    st.markdown("### 📤 Exact Prompt Sent to GPT-3.5-Turbo")
	                    st.markdown("This is exactly what was sent to the AI to generate the explanation:")
	                    st.code(detailed_prompt, language="text")

	                    # Report the values from the payload that was actually
	                    # sent, so this tab cannot drift from the real request.
	                    st.markdown("API Details:")
	                    st.json({
	                        "model": data["model"],
	                        "temperature": data["temperature"],
	                        "max_tokens": data["max_tokens"],
	                        "system_message": data["messages"][0]["content"],
	                    })

	                with tab3:
	                    st.markdown("### 🔧 Technical Details")

	                    col1, col2 = st.columns(2)

	                    with col1:
	                        st.markdown("Sentence 1 Analysis:")
	                        st.text(f"Original: {sentence1}")
	                        st.text(f"Normalized: {sentence1_normalized}")
	                        st.text(f"Embedding shape: {embedding1.shape}")
	                        st.text(f"Embedding L2 norm: {np.linalg.norm(embedding1):.4f}")

	                        st.markdown("First 10 embedding dimensions:")
	                        embedding_preview = embedding1[0][:10]
	                        for i, val in enumerate(embedding_preview):
	                            st.text(f"Dim {i}: {val:.4f}")

	                    with col2:
	                        st.markdown("Sentence 2 Analysis:")
	                        st.text(f"Original: {sentence2}")
	                        st.text(f"Normalized: {sentence2_normalized}")
	                        st.text(f"Embedding shape: {embedding2.shape}")
	                        st.text(f"Embedding L2 norm: {np.linalg.norm(embedding2):.4f}")

	                        st.markdown("First 10 embedding dimensions:")
	                        embedding_preview = embedding2[0][:10]
	                        for i, val in enumerate(embedding_preview):
	                            st.text(f"Dim {i}: {val:.4f}")

	                    st.markdown("---")
	                    st.markdown("Similarity Computation Details:")

	                    col1, col2, col3 = st.columns(3)

	                    with col1:
	                        st.metric("Embedding Dimensions", "384")
	                        st.metric("Exact Similarity", f"{similarity_float:.6f}")

	                    with col2:
	                        st.metric("Rounded Similarity", f"{similarity_rounded:.2f}")
	                        dot_product = np.dot(embedding1[0], embedding2[0])
	                        st.metric("Dot Product", f"{dot_product:.4f}")

	                    with col3:
	                        # Angle between the vectors; the clip guards arccos
	                        # against values just outside [-1, 1].
	                        angle = np.arccos(np.clip(similarity_float, -1.0, 1.0))
	                        angle_degrees = np.degrees(angle)
	                        st.metric("Vector Angle (degrees)", f"{angle_degrees:.2f}°")
	                        st.metric("Model Used", "all-MiniLM-L6-v2")

	                # Save to history (the unescaped explanation; the history
	                # view renders it as plain Markdown).
	                st.session_state.history.append({
	                    "sentence1": sentence1,
	                    "sentence2": sentence2,
	                    "similarity": similarity_rounded,
	                    "explanation": explanation,
	                })

	                st.success("✅ Analysis complete! Check the tabs above for detailed explanations.")

	            else:
	                st.error(f"❌ API Error: {response.status_code}")
	                st.error(f"Response: {response.text}")

	        except Exception as e:
	            # Top-level boundary for the button handler: report the failure
	            # in the page instead of crashing the script run.
	            st.error(f"❌ An error occurred: {str(e)}")
	            st.error("Please check your API key and internet connection.")

	# Past results: list the five most recent calculations, newest first,
	# each inside a collapsible section titled with the sentences and score.
	if st.session_state.history:
	    st.markdown("---")
	    st.markdown("### 📜 Previous Calculations")

	    latest_entries = st.session_state.history[-5:]
	    for entry in latest_entries[::-1]:
	        expander_title = f"'{entry['sentence1']}' vs '{entry['sentence2']}' → Score: {entry['similarity']:.2f}"
	        with st.expander(expander_title):
	            st.markdown(entry['explanation'])

	# Reference text explaining how to read the score bands, shown in a
	# collapsible section below the results.
	score_guide_md = """
	### How to Interpret Cosine Similarity Scores

	What the numbers mean:
	- 0.90 - 1.00: Nearly identical meaning (e.g., "The car is fast" vs "The automobile is quick")
	- 0.70 - 0.89: High semantic similarity (e.g., "I love dogs" vs "I adore puppies")
	- 0.50 - 0.69: Moderate similarity (e.g., "You are hot" vs "You are cold" - same structure, opposite meaning)
	- 0.30 - 0.49: Low similarity (e.g., "I like pizza" vs "Mathematics is difficult")
	- 0.00 - 0.29: Very low similarity (e.g., "Hello world" vs "Quantum physics equations")

	Why transformer embeddings are powerful:
	- They understand context and meaning, not just word overlap
	- They capture relationships between words (synonyms, antonyms, related concepts)
	- They consider sentence structure and grammatical patterns
	- They detect semantic similarity even with different words
	"""
	score_guide = st.expander("ℹ️ Understanding Semantic Similarity Scores")
	score_guide.markdown(score_guide_md)

	# Footer: a divider followed by a centered credits block. The block is raw
	# HTML, hence unsafe_allow_html=True; it contains only static text.
	# The separator is a plain "|": the former "\|" was an invalid escape
	# sequence in a non-raw string and left a stray backslash in the text.
	st.markdown("---")
	st.markdown("""
	<div style='text-align: center'>
	<p>🚀 Made with ❤️ using Streamlit | Powered by Sentence Transformers & OpenRouter API</p>
	<p><small>Each calculation automatically sends your sentences and similarity score to GPT-3.5-turbo for detailed analysis</small></p>
	</div>
	""", unsafe_allow_html=True)