semantic-search / app.py
Testys's picture
Update app.py
a4cd871
raw
history blame
10.2 kB
import streamlit as st
from search_utils import SemanticSearch
import logging
import time
import os
import sys
import psutil # Added missing import
from urllib.parse import urlparse
import threading
import re
# Application-wide logging: INFO and above, timestamped, written to the
# default stream of a StreamHandler.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Named logger shared by every function in this module.
logger = logging.getLogger("SemanticSearchApp")
# Security validation functions
def is_valid_url(url, allowed_schemes=("http", "https")):
    """Return True when *url* is an absolute URL with an allowed scheme.

    A URL is accepted only if it is a string, parses without error, has both
    a scheme and a network location, and its scheme is in *allowed_schemes*.
    Restricting the scheme rejects inputs such as ``javascript://host/...``
    that carry a netloc but are unsafe to render as links.

    Args:
        url: Candidate URL. Non-string values are rejected.
        allowed_schemes: Collection of lower-case schemes considered safe.

    Returns:
        bool: True if the URL passes every check, False otherwise.
    """
    if not isinstance(url, str):
        return False

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed input (e.g. bad IPv6 host).
        return False

    if not parsed.scheme or not parsed.netloc:
        return False

    return parsed.scheme.lower() in allowed_schemes
def sanitize_query(query):
    """Return a cleaned, length-limited version of a user search query.

    Every character that is not a word character (letters, digits,
    underscore), whitespace, or a hyphen is removed. Runs of whitespace are
    then collapsed to single spaces and leading/trailing whitespace is
    dropped, so a query made only of blanks or punctuation becomes ``""``
    and is caught by the caller's emptiness check.

    Args:
        query: Raw user input. Anything that is not a string yields ``""``
            instead of being passed through unsanitised.

    Returns:
        str: The sanitised query, at most 256 characters long.
    """
    if not isinstance(query, str):
        return ""

    stripped = re.sub(r'[^\w\s-]', '', query)
    # Removing characters can leave doubled spaces; normalise them.
    normalised = " ".join(stripped.split())
    # Truncate last, then trim a space the cut may have exposed at the end.
    return normalised[:256].rstrip()
# Diagnostics integration
# The diagnostics module is optional: record whether it could be imported so
# the UI can decide whether to offer the diagnostics panel.
try:
    from diagnostics import diagnose_parquet_files
except ImportError:
    diagnostics_available = False
    logger.warning("Diagnostics module not available")
else:
    diagnostics_available = True
def add_diagnostics_ui(search_system):
    """Render the sidebar "Diagnostics" expander and its on-demand checks.

    When the "Run Full System Check" button is pressed, the panel reports:

    * whether the metadata shard directory exists, whether it contains any
      ``*.parquet`` file, and the verdict of ``diagnose_parquet_files``;
    * whether the FAISS shard directory exists, how many ``*.index`` files it
      holds, and how many shards / vectors are currently loaded;
    * resident memory of this process and system-wide CPU utilisation.

    ``diagnose_parquet_files`` is only importable when the optional
    diagnostics module is present, so callers should check
    ``diagnostics_available`` before calling this function.

    Args:
        search_system: Object exposing ``metadata_mgr.shard_dir``,
            ``shard_dir`` and ``index_shards``. Both directories are used via
            ``exists()`` / ``glob()`` -- presumably ``pathlib.Path`` objects;
            confirm against ``SemanticSearch``.
    """
    with st.sidebar.expander("🔧 Diagnostics", expanded=False):
        if st.button("Run Full System Check"):
            with st.spinner("Performing comprehensive system check..."):
                # Two columns: metadata on the left, FAISS on the right.
                col1, col2 = st.columns(2)

                # Use the paths the search system is actually configured with.
                metadata_dir = search_system.metadata_mgr.shard_dir
                faiss_dir = search_system.shard_dir

                with col1:
                    st.subheader("📂 Metadata Validation")
                    if metadata_dir.exists():
                        dir_status = any(metadata_dir.glob("*.parquet"))
                        st.write(f"Directory: `{metadata_dir}`")
                        st.write(f"Parquet Files Found: {'✅' if dir_status else '❌'}")

                        # Per-file validation is delegated to the diagnostics module.
                        if diagnose_parquet_files(str(metadata_dir)):
                            st.success("✅ Metadata shards valid")
                        else:
                            st.error("❌ Metadata issues detected")
                    else:
                        st.error("Metadata directory not found")

                with col2:
                    st.subheader("📚 FAISS Validation")
                    if faiss_dir.exists():
                        index_files = list(faiss_dir.glob("*.index"))
                        st.write(f"Directory: `{faiss_dir}`")
                        st.write(f"Index Files Found: {len(index_files)}")

                        loaded_shards = len(search_system.index_shards)
                        if loaded_shards > 0:
                            st.success(f"✅ {loaded_shards} FAISS shards loaded")
                            st.write(f"Total Vectors: {sum(s.ntotal for s in search_system.index_shards):,}")
                        else:
                            st.error("❌ No FAISS shards loaded")
                    else:
                        st.error("FAISS directory not found")

                # NOTE(review): placement of this section (inside the spinner,
                # below both columns) is reconstructed -- confirm.
                st.subheader("💻 System Resources")
                col_res1, col_res2 = st.columns(2)
                with col_res1:
                    st.metric("Memory Usage",
                              f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
                              help="Current process memory usage")
                with col_res2:
                    # A short blocking interval is required: with no interval
                    # the first call returns a meaningless 0.0.
                    st.metric("CPU Utilization",
                              f"{psutil.cpu_percent(interval=0.1)}%",
                              help="Total system CPU usage")
# Custom CSS for expander headers, source links and the similarity badge.
# Kept at column 0 so the markup reaches st.markdown exactly as written.
_CUSTOM_CSS = """
<style>
div[data-testid="stExpander"] div[role="button"] p {
font-size: 1.2rem;
font-weight: bold;
color: #1e88e5;
}
a.source-link {
color: #1a73e8 !important;
text-decoration: none !important;
border-bottom: 2px solid transparent;
transition: all 0.3s ease;
}
a.source-link:hover {
border-bottom-color: #1a73e8;
opacity: 0.9;
}
.similarity-badge {
padding: 0.2em 0.5em;
border-radius: 4px;
background: #e3f2fd;
color: #1e88e5;
font-weight: 500;
}
</style>
"""

# Hints shown when a search returns no rows.
_SEARCH_TIPS = """
- Use more specific keywords
- Check your spelling
- Avoid special characters
"""


def _unit_interval(value):
    """Coerce *value* to a float clamped to [0.0, 1.0]; bad input gives 0.0."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return 0.0
    if number != number:  # NaN is the only float unequal to itself
        return 0.0
    return min(max(number, 0.0), 1.0)


def _system_load():
    """Return the 1-minute load average as text, or a placeholder if unsupported."""
    try:
        return f"{os.getloadavg()[0]:.2f}"
    except (AttributeError, OSError):
        # os.getloadavg is absent on Windows and may raise OSError elsewhere.
        return "unavailable"


def _render_results(results, search_duration):
    """Render one expander per result row with summary, confidence and source."""
    st.subheader(f"Top Results ({search_duration:.2f}s)")
    for _, row in results.iterrows():
        with st.expander(f"{row['title']}"):
            summary_col, score_col = st.columns([3, 1])
            with summary_col:
                st.markdown(f"**Summary**: {row['summary']}")
            with score_col:
                st.markdown(
                    f"<div class='similarity-badge'>"
                    f"Confidence: {row['similarity']:.1%}"
                    f"</div>",
                    unsafe_allow_html=True
                )
                # st.progress rejects floats outside [0.0, 1.0]; clamp so an
                # out-of-range score cannot fail the whole search.
                st.progress(_unit_interval(row['similarity']))
            if row['source']:
                # NOTE(review): 'source' is rendered as raw HTML -- presumably
                # pre-built, trusted markup from the search layer; confirm.
                st.markdown(row['source'], unsafe_allow_html=True)
            else:
                st.warning("Invalid source URL")


def _render_sidebar(search_system):
    """Render system status metrics, diagnostics, health check and cache reset."""
    # NOTE(review): everything below is assumed to live inside the sidebar;
    # the original nesting was not recoverable -- confirm.
    with st.sidebar:
        st.subheader("📊 System Status")
        docs_col, shards_col = st.columns(2)
        with docs_col:
            st.metric("Total Documents",
                      f"{search_system.metadata_mgr.total_docs:,}",
                      help="Total indexed documents in system")
        with shards_col:
            st.metric("FAISS Shards",
                      len(search_system.index_shards),
                      help="Number of loaded vector index shards")
        st.metric("Active Memory",
                  f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
                  help="Current memory usage by the application")

        # Diagnostics section
        if diagnostics_available:
            add_diagnostics_ui(search_system)
        else:
            st.warning("Diagnostics module not available")

        # Health check with error handling
        if st.button("🩺 Run Health Check"):
            try:
                system_stats = {
                    "shards_loaded": len(search_system.index_shards),
                    "metadata_records": search_system.metadata_mgr.total_docs,
                    "memory_usage": f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
                    "active_threads": threading.active_count(),
                    "system_load": _system_load()
                }
                st.json(system_stats)
            except Exception as e:
                st.error(f"Health check failed: {str(e)}")

        # Cache management
        if st.button("♻️ Clear Cache"):
            try:
                st.cache_resource.clear()
                st.rerun()
            except Exception as e:
                st.error(f"Cache clearance failed: {str(e)}")


def main():
    """Build the Streamlit page: search box, ranked results and status sidebar.

    The search backend is created once through ``st.cache_resource`` (one-hour
    TTL). A non-empty query is sanitised, searched for the top 5 matches and
    rendered; failures are logged with a traceback and reported to the user
    with a generic message.
    """
    st.set_page_config(
        page_title="Semantic Search Engine",
        page_icon="🔍",
        layout="wide"
    )

    # Initialize search system with enhanced caching
    @st.cache_resource(ttl=3600, show_spinner="Initializing search engine...")
    def init_search_system():
        try:
            system = SemanticSearch()
            system.initialize_system()
            logger.info("Search system initialized successfully")
            return system
        except Exception as e:
            logger.exception("System initialization failed: %s", e)
            st.error("Critical system initialization error. Check logs.")
            st.stop()

    st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

    try:
        search_system = init_search_system()
    except Exception as e:
        st.error(f"Failed to initialize search system: {str(e)}")
        st.stop()

    # Main UI components
    st.title("🔍 Semantic Search Engine")

    # Search input with sanitization
    query = st.text_input("Enter your search query:",
                          placeholder="Search documents...",
                          max_chars=200)
    if query:
        try:
            clean_query = sanitize_query(query)
            # Also reject whitespace-only text so it never reaches the search.
            if not clean_query or not clean_query.strip():
                st.warning("Please enter a valid search query")
                st.stop()

            with st.spinner("🔍 Searching through documents..."):
                start_time = time.time()
                results = search_system.search(clean_query, 5)
                search_duration = time.time() - start_time

            if not results.empty:
                _render_results(results, search_duration)
            else:
                st.warning("No matching documents found")
                st.info("Try these tips:")
                st.markdown(_SEARCH_TIPS)
        except Exception as e:
            logger.exception("Search failed: %s", e)
            st.error("Search operation failed. Please try again.")

    # System monitoring sidebar
    _render_sidebar(search_system)
# Entry point: build the UI only when this file is executed as the main
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()