import streamlit as st
import pandas as pd
import numpy as np
import json
import os
import time
import zipfile
from sentence_transformers import SentenceTransformer, util
from loguru import logger
# ================== CONFIGURATION ==================
st.set_page_config(
    page_title="Problem Deduplication Explorer",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Initialize session state
if 'page_number' not in st.session_state:
    st.session_state.page_number = 0
if 'analysis_results' not in st.session_state:
    st.session_state.analysis_results = None
if 'filtered_results' not in st.session_state:
    st.session_state.filtered_results = None
# Load a pre-trained model for embeddings with HF caching.
# st.cache_resource keeps one model instance across Streamlit reruns;
# without it the model would be reloaded on every widget interaction.
@st.cache_resource
def load_model():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    try:
        return SentenceTransformer(model_name, cache_folder="/tmp/sentence_transformers")
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None

model = load_model()
# Load the preloaded dataset, cached across reruns
@st.cache_data
def load_data():
    try:
        file_path = "data/merged_dataset.csv.zip"
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            with zip_ref.open(zip_ref.namelist()[0]) as file:
                df = pd.read_csv(file)
        return df[["uuid", "problem", "source", "question_type", "problem_type"]]
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame(columns=["uuid", "problem", "source", "question_type", "problem_type"])
# Cache embeddings computation with error handling
@st.cache_data
def compute_embeddings(problems):
    """Compute and cache sentence embeddings."""
    try:
        return model.encode(problems, normalize_embeddings=True)
    except Exception as e:
        st.error(f"Error computing embeddings: {e}")
        return np.array([])
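
# Because normalize_embeddings=True L2-normalizes each vector above, cosine
# similarity reduces to a plain dot product. A minimal, illustrative sanity
# check (the two example strings are arbitrary placeholders):
#   a, b = model.encode(["What is 2+2?", "Compute 2 + 2."], normalize_embeddings=True)
#   assert abs(float(np.dot(a, b)) - float(util.cos_sim(a, b))) < 1e-6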
def find_similar_problems(df, similarity_threshold=0.9, progress_bar=None):
    """Find similar problems using cosine similarity, optimized for speed."""
    if df.empty:
        return []
    embeddings = compute_embeddings(df['problem'].tolist())
    if embeddings.size == 0:
        return []
    if progress_bar:
        progress_bar.progress(0.33, "Computing similarity matrix...")
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
    if progress_bar:
        progress_bar.progress(0.66, "Finding similar pairs...")
    num_problems = len(df)
    upper_triangle_indices = np.triu_indices(num_problems, k=1)
    similarity_scores = similarity_matrix[upper_triangle_indices]
    mask = similarity_scores > similarity_threshold
    filtered_indices = np.where(mask)[0]
    pairs = [
        (df.iloc[upper_triangle_indices[0][i]]["uuid"],
         df.iloc[upper_triangle_indices[1][i]]["uuid"],
         float(similarity_scores[i]))
        for i in filtered_indices
    ]
    if progress_bar:
        progress_bar.progress(1.0, "Analysis complete!")
        time.sleep(0.5)
        progress_bar.empty()
    return sorted(pairs, key=lambda x: x[2], reverse=True)
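
# Illustration of the upper-triangle trick on a hypothetical 3-problem dataset:
# np.triu_indices(3, k=1) returns (array([0, 0, 1]), array([1, 2, 2])), i.e. the
# unordered pairs (0, 1), (0, 2), (1, 2), so each pair is scored exactly once
# and the self-similarity diagonal is skipped entirely.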
# Analyze duplicate problem clusters with caching; the leading underscore on
# _df tells st.cache_data not to hash the DataFrame argument.
@st.cache_data
def analyze_clusters(_df, pairs):
    """Analyze duplicate problem clusters with caching."""
    if not pairs or _df.empty:
        return []
    detailed_analysis = []
    for base_uuid, comp_uuid, score in pairs:
        base_row = _df[_df["uuid"] == base_uuid].iloc[0]
        comp_row = _df[_df["uuid"] == comp_uuid].iloc[0]
        column_differences = {
            col: {
                'base': base_row[col],
                'comparison': comp_row[col],
                'match': bool(base_row[col] == comp_row[col])
            }
            for col in _df.columns if col != "uuid"
        }
        detailed_analysis.append({
            'base_uuid': base_uuid,
            'comp_uuid': comp_uuid,
            'similarity_score': score,
            'column_differences': column_differences,
        })
    return detailed_analysis
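
# Each entry produced above has this shape (values here are made up purely
# for illustration):
#   {'base_uuid': 'uuid-a', 'comp_uuid': 'uuid-b', 'similarity_score': 0.93,
#    'column_differences': {'problem': {'base': '...', 'comparison': '...', 'match': False},
#                           'source': {'base': 'x', 'comparison': 'x', 'match': True}, ...}}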
def apply_filters(results, df, selected_source, selected_qtype):
    """Apply filters to results."""
    filtered = results.copy()
    if selected_source:
        filtered = [r for r in filtered if df[df["uuid"] == r["base_uuid"]]["source"].values[0] == selected_source]
    if selected_qtype:
        filtered = [r for r in filtered if df[df["uuid"] == r["base_uuid"]]["question_type"].values[0] == selected_qtype]
    return filtered
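
# Example usage ("olympiads" is a hypothetical source name):
#   apply_filters(results, df, "olympiads", None)
# keeps only the pairs whose base problem comes from the "olympiads" source;
# passing None for a filter leaves that dimension unfiltered.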
def main():
    st.title("🔍 Problem Deduplication Explorer")
    if model is None:
        st.error("Failed to load the model. Please try again later.")
        return

    # Sidebar configuration
    with st.sidebar:
        st.header("Settings")
        similarity_threshold = st.slider(
            "Similarity Threshold",
            min_value=0.5,
            max_value=1.0,
            value=0.9,
            step=0.01,
            help="Higher values mean more similar problems"
        )
        items_per_page = st.select_slider(
            "Items per page",
            options=[5, 10, 20, 50],
            value=10,
            help="Number of results to show per page"
        )

    # Load and display dataset
    df = load_data()
    if df.empty:
        st.error("Failed to load the dataset. Please check if the data file exists in the correct location.")
        return

    with st.expander("📊 Dataset Preview", expanded=False):
        st.dataframe(
            df.head(),
            use_container_width=True,
            hide_index=True
        )
    # Analysis section
    if st.sidebar.button("Run Deduplication Analysis", type="primary") or st.session_state.analysis_results is not None:
        if st.session_state.analysis_results is None:
            progress_bar = st.progress(0, "Starting analysis...")
            pairs = find_similar_problems(df, similarity_threshold, progress_bar)
            st.session_state.analysis_results = analyze_clusters(df, pairs)
        results = st.session_state.analysis_results
        if not results:
            st.warning("No similar problems found with the current threshold.")
            return

        # Filtering options
        sources = sorted(df["source"].unique().tolist())
        question_types = sorted(df["question_type"].unique().tolist())
        col1, col2 = st.columns(2)
        with col1:
            selected_source = st.selectbox("Filter by Source", [None] + sources)
        with col2:
            selected_qtype = st.selectbox("Filter by Question Type", [None] + question_types)

        # Apply filters and store in session state
        filtered_results = apply_filters(results, df, selected_source, selected_qtype)
        st.session_state.filtered_results = filtered_results
        if not filtered_results:
            st.warning("No results found with the current filters.")
            return
        # Pagination
        total_pages = (len(filtered_results) - 1) // items_per_page
        st.session_state.page_number = min(st.session_state.page_number, total_pages)
        col1, col2, col3 = st.columns([1, 3, 1])
        with col1:
            if st.button("⬅️ Previous", disabled=st.session_state.page_number <= 0):
                st.session_state.page_number -= 1
                st.rerun()  # rerun so the page label and results reflect the new page
        with col2:
            st.write(f"Page {st.session_state.page_number + 1} of {total_pages + 1}")
        with col3:
            if st.button("Next ➡️", disabled=st.session_state.page_number >= total_pages):
                st.session_state.page_number += 1
                st.rerun()

        # Display results
        start_idx = st.session_state.page_number * items_per_page
        end_idx = start_idx + items_per_page
        page_results = filtered_results[start_idx:end_idx]
        for entry in page_results:
            with st.container():
                col1, col2 = st.columns([1, 1])
                with col1:
                    st.markdown("### Original Problem")
                    st.info(df[df["uuid"] == entry["base_uuid"]]["problem"].values[0])
                with col2:
                    st.markdown("### Similar Problem")
                    st.info(df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0])
                st.metric("Similarity Score", f"{entry['similarity_score']:.4f}")
                with st.expander("Show Details"):
                    st.json(entry["column_differences"])
            st.markdown("---")
if __name__ == "__main__":
    main()
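
# To run the app locally (assuming this file is saved as app.py and the
# dependencies are installed, e.g. via
# pip install streamlit pandas numpy sentence-transformers loguru):
#   streamlit run app.py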