| """Streamlit demo interface for the Korean Q&A evaluation system.""" | |
| import streamlit as st | |
| import json | |
| import sys | |
| from pathlib import Path | |
| import logging | |
| import pandas as pd | |
| from typing import Dict, Any | |
| import plotly.graph_objects as go | |
| # Add src to path | |
| sys.path.append(str(Path(__file__).parent / "src")) | |
| from src.logger import setup_logging | |
| from src.config import Config | |
| from src.dataset_loader import DatasetLoader | |
| from src.evaluator import KoreanQAEvaluator | |
| from src.visualization import EvaluationVisualizer | |
| # Page config | |
| st.set_page_config( | |
| page_title="Korean Q&A Evaluation System", | |
| page_icon="๐ฐ๐ท", | |
| layout="wide", | |
| initial_sidebar_state="expanded" | |
| ) | |
| # Custom CSS | |
| st.markdown(""" | |
| <style> | |
| .main-header { | |
| font-size: 2.5rem; | |
| color: #1f77b4; | |
| text-align: center; | |
| margin-bottom: 2rem; | |
| } | |
| .metric-card { | |
| background-color: #f0f2f6; | |
| padding: 1rem; | |
| border-radius: 0.5rem; | |
| border-left: 4px solid #1f77b4; | |
| } | |
| .success-metric { | |
| border-left-color: #28a745; | |
| } | |
| .warning-metric { | |
| border-left-color: #ffc107; | |
| } | |
| .error-metric { | |
| border-left-color: #dc3545; | |
| } | |
| </style> | |
| """, unsafe_allow_html=True) | |
class StreamlitDemo:
    """Streamlit demo interface for Korean Q&A evaluation."""

    def __init__(self):
        self.visualizer = EvaluationVisualizer()

        # Initialize session state
        if 'config' not in st.session_state:
            st.session_state.config = self._load_config()
        if 'evaluator' not in st.session_state:
            st.session_state.evaluator = None
        if 'current_results' not in st.session_state:
            st.session_state.current_results = None

    def _load_config(self):
        """Load configuration."""
        try:
            script_dir = Path(__file__).parent
            config_path = script_dir / "src" / "config.yaml"
            if config_path.exists():
                return Config(str(config_path))
        except Exception as e:
            st.error(f"Failed to load configuration: {e}")
        return None
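    # The Config object loaded from src/config.yaml is assumed to expose
    # `google_api_key` and `gemini_model`; those are the only attributes this
    # file reads from it (in _initialize_evaluator and render_sidebar).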
    def _initialize_evaluator(self, api_key: str, threshold: float = 0.8):
        """Initialize the evaluator."""
        try:
            if not api_key and st.session_state.config:
                api_key = st.session_state.config.google_api_key
            if not api_key:
                st.error("Please provide Google API key")
                return False

            model_name = st.session_state.config.gemini_model if st.session_state.config else "gemini-2.0-flash"
            st.session_state.evaluator = KoreanQAEvaluator(
                model_name=model_name,
                api_key=api_key,
                threshold=threshold,
                verbose_mode=True
            )
            return True
        except Exception as e:
            st.error(f"Failed to initialize evaluator: {e}")
            return False
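    # API-key resolution order: a key typed into the sidebar takes precedence, and an
    # empty value falls back to config.google_api_key. The KoreanQAEvaluator instance
    # only needs to provide evaluate_single_case(input_text=..., actual_output=...) and
    # evaluate_dataset(dataset); those are the only methods called on it below.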
    def render_header(self):
        """Render the main header."""
        st.markdown('<h1 class="main-header">🇰🇷 Korean Q&A Evaluation System</h1>', unsafe_allow_html=True)
        st.markdown("---")

        # Info section
        with st.expander("ℹ️ About this system"):
            st.markdown("""
            This system evaluates Korean language question-answering models using:

            - **DeepEval Framework**: Advanced evaluation metrics
            - **Google Gemini Model**: State-of-the-art language model for evaluation
            - **Interactive Visualizations**: Real-time charts and analysis

            **Supported Metrics:**
            - Answer Relevancy: How relevant is the answer to the question?
            - Contextual Precision: How precise is the answer given the context?
            """)
    def render_sidebar(self):
        """Render the sidebar with configuration."""
        st.sidebar.header("⚙️ Configuration")

        # API Key input
        api_key = st.sidebar.text_input(
            "Google API Key",
            type="password",
            help="Enter your Google API key for Gemini model access"
        )

        # Threshold slider
        threshold = st.sidebar.slider(
            "Evaluation Threshold",
            min_value=0.0,
            max_value=1.0,
            value=0.8,
            step=0.1,
            help="Minimum score required to pass evaluation"
        )

        # Model info
        if st.session_state.config:
            st.sidebar.info(f"Model: {st.session_state.config.gemini_model}")

        return api_key, threshold
    def render_single_evaluation(self, api_key: str, threshold: float):
        """Render single evaluation interface."""
        st.header("🔍 Single Question Evaluation")

        col1, col2 = st.columns([1, 1])

        with col1:
            input_text = st.text_area(
                "Input Question (Korean)",
                placeholder="이번 달 우리 회사 전체 매출은 얼마야?",
                height=100
            )
            actual_output = st.text_area(
                "Actual Output (Korean)",
                placeholder="2025년 1월 일광 Global 전체 매출은 335.4억원입니다...",
                height=150
            )

            if st.button("🚀 Evaluate Single Question", type="primary"):
                if not input_text.strip() or not actual_output.strip():
                    st.error("Please provide both input and output text")
                    return
                if not self._initialize_evaluator(api_key, threshold):
                    return

                with st.spinner("Evaluating..."):
                    try:
                        results = st.session_state.evaluator.evaluate_single_case(
                            input_text=input_text,
                            actual_output=actual_output
                        )
                        # Display results
                        self._display_single_results(results)
                    except Exception as e:
                        st.error(f"Evaluation failed: {e}")

        with col2:
            st.info("💡 **Tips:**\n\n- Enter Korean text for best results\n- Longer, more detailed answers typically score higher\n- The system evaluates relevance, not correctness")
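    # Dataset evaluation: the uploaded CSV must provide 'input' and 'expected_output'
    # columns; a matching sample file can be downloaded from the Sample Data tab
    # (render_sample_data_tab below).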
    def render_dataset_evaluation(self, api_key: str, threshold: float):
        """Render dataset evaluation interface."""
        st.header("📊 Dataset Evaluation")

        # File upload
        uploaded_file = st.file_uploader(
            "Upload Dataset CSV",
            type=['csv'],
            help="CSV file should have 'input' and 'expected_output' columns"
        )

        if uploaded_file is not None:
            # Show preview
            try:
                df = pd.read_csv(uploaded_file)
                st.subheader("📋 Dataset Preview")
                st.dataframe(df.head(), use_container_width=True)

                col1, col2, col3 = st.columns([1, 1, 2])
                with col1:
                    st.metric("Total Rows", len(df))
                with col2:
                    st.metric("Columns", len(df.columns))
                with col3:
                    st.write("**Columns:**", ", ".join(df.columns.tolist()))
            except Exception as e:
                st.error(f"Error reading CSV file: {e}")
                return

            # Evaluation button
            if st.button("🚀 Evaluate Dataset", type="primary"):
                if not self._initialize_evaluator(api_key, threshold):
                    return

                with st.spinner("Evaluating dataset... This may take a while."):
                    try:
                        # Save uploaded file temporarily
                        temp_path = "temp_dataset.csv"
                        df.to_csv(temp_path, index=False)

                        # Load and evaluate
                        dataset_loader = DatasetLoader()
                        dataset = dataset_loader.load_from_csv(temp_path)
                        results = st.session_state.evaluator.evaluate_dataset(dataset)
                        st.session_state.current_results = results

                        # Display results
                        self._display_dataset_results(results)

                        # Clean up
                        Path(temp_path).unlink(missing_ok=True)
                    except Exception as e:
                        st.error(f"Dataset evaluation failed: {e}")
    def _display_single_results(self, results: Dict[str, Any]):
        """Display single evaluation results."""
        st.subheader("📊 Evaluation Results")

        if results.get('detailed_results'):
            result = results['detailed_results'][0]

            # Metrics display
            metrics = result.get('metrics', {})
            if metrics:
                cols = st.columns(len(metrics))
                for i, (metric_name, metric_data) in enumerate(metrics.items()):
                    with cols[i]:
                        score = metric_data.get('score', 0)
                        passed = metric_data.get('passed', False)

                        # Color based on pass/fail
                        if passed:
                            st.markdown('<div class="metric-card success-metric">', unsafe_allow_html=True)
                            st.metric(metric_name, f"{score:.4f}", "✅ PASS")
                        else:
                            st.markdown('<div class="metric-card error-metric">', unsafe_allow_html=True)
                            st.metric(metric_name, f"{score:.4f}", "❌ FAIL")
                        st.markdown('</div>', unsafe_allow_html=True)

            # Visualizations
            col1, col2 = st.columns(2)
            with col1:
                fig = self.visualizer.create_score_histogram(results)
                st.plotly_chart(fig, use_container_width=True)
            with col2:
                fig = self.visualizer.create_pass_fail_pie_chart(results)
                st.plotly_chart(fig, use_container_width=True)
    def _display_dataset_results(self, results: Dict[str, Any]):
        """Display dataset evaluation results."""
        st.subheader("📊 Dataset Evaluation Results")

        # Summary metrics
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.markdown('<div class="metric-card">', unsafe_allow_html=True)
            st.metric("Total Cases", results.get('total_cases', 0))
            st.markdown('</div>', unsafe_allow_html=True)
        with col2:
            passed = results.get('passed_cases', 0)
            st.markdown('<div class="metric-card success-metric">', unsafe_allow_html=True)
            st.metric("Passed", passed)
            st.markdown('</div>', unsafe_allow_html=True)
        with col3:
            failed = results.get('failed_cases', 0)
            st.markdown('<div class="metric-card error-metric">', unsafe_allow_html=True)
            st.metric("Failed", failed)
            st.markdown('</div>', unsafe_allow_html=True)
        with col4:
            pass_rate = results.get('pass_rate', 0)
            color_class = "success-metric" if pass_rate >= 80 else "warning-metric" if pass_rate >= 60 else "error-metric"
            st.markdown(f'<div class="metric-card {color_class}">', unsafe_allow_html=True)
            st.metric("Pass Rate", f"{pass_rate:.1f}%")
            st.markdown('</div>', unsafe_allow_html=True)

        # Additional metrics
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Average Score", f"{results.get('average_score', 0):.4f}")
        with col2:
            st.metric("Threshold", results.get('threshold', 0.8))
        with col3:
            st.metric("Model", results.get('model_name', 'N/A'))

        # Visualizations
        st.subheader("📈 Detailed Analysis")

        # First row of charts
        col1, col2 = st.columns(2)
        with col1:
            fig = self.visualizer.create_score_histogram(results)
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            fig = self.visualizer.create_pass_fail_pie_chart(results)
            st.plotly_chart(fig, use_container_width=True)

        # Second row of charts
        fig = self.visualizer.create_metrics_comparison(results)
        st.plotly_chart(fig, use_container_width=True)

        # Third row
        fig = self.visualizer.create_score_vs_length_scatter(results)
        st.plotly_chart(fig, use_container_width=True)

        # Summary table
        fig = self.visualizer.create_summary_stats_table(results)
        st.plotly_chart(fig, use_container_width=True)

        # Download results
        if st.button("📥 Download Results JSON"):
            json_str = json.dumps(results, ensure_ascii=False, indent=2)
            st.download_button(
                label="Download JSON",
                data=json_str,
                file_name="evaluation_results.json",
                mime="application/json"
            )
    def render_sample_data_tab(self):
        """Render sample data information."""
        st.header("📋 Sample Data Format")

        st.markdown("""
        ### CSV Format Requirements

        Your dataset CSV file should have the following columns:

        - `input`: The question or input text (Korean)
        - `expected_output`: The expected answer or output text (Korean)
        """)

        # Sample data
        sample_data = {
            'input': [
                '이번 달 우리 회사 전체 매출은 얼마야?',
                '사업부별 매출 비중이 어떻게 되나요?',
                '최근 수율이 낮은 공정이 있나요?'
            ],
            'expected_output': [
                '2025년 1월 일광 Global 전체 매출은 335.4억원입니다.',
                '한국 사업부: 213.0억원 (39.7%), 베트남 사업부: 38.6억원 (44.1%)',
                'R47 ENCLOSURE 사출: 59%, R47 ARM 사출: 80% 등이 90% 미만입니다.'
            ]
        }
        sample_df = pd.DataFrame(sample_data)

        st.subheader("Sample Data")
        st.dataframe(sample_df, use_container_width=True)

        # Download sample
        csv = sample_df.to_csv(index=False)
        st.download_button(
            label="📥 Download Sample CSV",
            data=csv,
            file_name="sample_korean_qa.csv",
            mime="text/csv"
        )
    def run(self):
        """Run the Streamlit app."""
        self.render_header()

        # Sidebar
        api_key, threshold = self.render_sidebar()

        # Main tabs
        tab1, tab2, tab3 = st.tabs(["🔍 Single Evaluation", "📊 Dataset Evaluation", "📋 Sample Data"])
        with tab1:
            self.render_single_evaluation(api_key, threshold)
        with tab2:
            self.render_dataset_evaluation(api_key, threshold)
        with tab3:
            self.render_sample_data_tab()
def main():
    """Main function to run the Streamlit app."""
    demo = StreamlitDemo()
    demo.run()


if __name__ == "__main__":
    main()