# Author: Varshith dharmaj
# Robust MVM2 System Sync: Fixed Imports and Restored Services
# b25b8f2 verified
"""
Streamlit Dashboard for Mathematical Reasoning Verification System
Interactive UI with real-time processing logs and results display
"""
import streamlit as st
import time
from typing import List, Dict, Any
from core import run_verification_parallel
# Initialize session state containers on first run (no-ops on reruns).
st.session_state.setdefault('steps_log', [])
st.session_state.setdefault('results', None)
def add_log(step: str, model: str, status: str, details: str):
    """Append one entry to the processing log kept in session state.

    Args:
        step: Name of the pipeline stage being logged.
        model: Which model/component produced the event.
        status: Status marker string (its leading symbol drives color coding
            in ``display_logs``).
        details: Free-form description of what happened.
    """
    st.session_state.steps_log.append({
        "step": step,
        "model": model,
        "status": status,
        "details": details,
        "timestamp": time.time(),
    })
def display_flowchart(problem="", steps_input=""):
    """Display interactive flowchart with expandable explanations.

    Renders, in order:
      1. a problem-specific ASCII flowchart with per-step pass/fail icons
         (when a problem and/or steps are available),
      2. per-step detail expanders with any classified errors,
      3. eight expanders explaining each pipeline stage, populated with
         live data once verification results exist in session state,
      4. a static processing-flow diagram.

    Args:
        problem: The math problem text entered by the user.
        steps_input: Newline-separated solution steps entered by the user.
    """
    # Pull verification results (if any) so boxes can show error status.
    has_results = st.session_state.results is not None
    results = st.session_state.results if has_results else None

    # Prefer the user's live input; fall back to steps stored in results.
    steps = []
    if steps_input:
        steps = [s.strip() for s in steps_input.split('\n') if s.strip()]
    elif has_results:
        steps = results.get('steps', [])

    # --- Problem-specific flowchart -----------------------------------
    if problem or steps:
        st.markdown("### 📊 Problem Flowchart")
        st.markdown("**Problem:**")
        st.info(problem if problem else "No problem entered yet")

        if steps:
            st.markdown("**Solution Flow:**")
            flowchart_lines = []
            flowchart_lines.append("```")
            # Truncate/pad the problem text so it fits in the box.
            problem_display = problem[:45] + "..." if len(problem) > 45 else problem
            problem_display = problem_display.ljust(50)
            flowchart_lines.append("┌─────────────────────────────────────────────────────────┐")
            flowchart_lines.append(f"│ 📥 PROBLEM: {problem_display} │")
            flowchart_lines.append("└────────────────────┬────────────────────────────────────┘")
            flowchart_lines.append(" │")
            flowchart_lines.append(" ▼")
            for i, step in enumerate(steps, 1):
                step_short = step[:45] + "..." if len(step) > 45 else step
                # Mark the step with ❌ if any classified error targets it.
                has_error = False
                if has_results:
                    for error in results.get('classified_errors', []):
                        if error.get('step_number') == i:
                            has_error = True
                            break
                status_icon = "❌" if has_error else "✅"
                # Format step text to fit in box (max 45 chars).
                step_display = step_short.ljust(45)
                # NOTE: the original code had an `if i < len(steps)` /
                # `else` pair whose branches were byte-identical (both
                # emitted the same box plus trailing arrow), so the
                # conditional was dead weight — collapsed to one path.
                flowchart_lines.append("┌─────────────────────────────────────────────────────────┐")
                flowchart_lines.append(f"│ {status_icon} STEP {i}: {step_display} │")
                flowchart_lines.append("└────────────────────┬────────────────────────────────────┘")
                flowchart_lines.append(" │")
                flowchart_lines.append(" ▼")
            flowchart_lines.append("┌─────────────────────────────────────────────────────────┐")
            flowchart_lines.append("│ 📤 FINAL ANSWER │")
            if has_results:
                consensus = results.get('consensus', {})
                final_verdict = consensus.get('final_verdict', 'UNKNOWN')
                verdict_icon = "❌ ERROR" if final_verdict == "ERROR" else "✅ VALID"
                verdict_display = verdict_icon.ljust(55)
                flowchart_lines.append(f"│ {verdict_display} │")
            flowchart_lines.append("└─────────────────────────────────────────────────────────┘")
            flowchart_lines.append("```")
            flowchart_text = "\n".join(flowchart_lines)
            st.code(flowchart_text, language=None)

            # --- Per-step detail expanders ----------------------------
            st.markdown("**Step Details:**")
            for i, step in enumerate(steps, 1):
                step_errors = []
                if has_results:
                    classified_errors = results.get('classified_errors', [])
                    step_errors = [e for e in classified_errors if e.get('step_number') == i]
                if step_errors:
                    with st.expander(f"Step {i}: {step[:60]}{'...' if len(step) > 60 else ''} ❌", expanded=False):
                        st.write(f"**Full Step:** {step}")
                        for error in step_errors:
                            st.error(f"**Error Found:** {error.get('category', 'Unknown')}")
                            st.write(f"- Found: `{error.get('found', 'N/A')}`")
                            st.write(f"- Correct: `{error.get('correct', 'N/A')}`")
                        # NOTE(review): assumes explanation keys are int
                        # step numbers matching `i` — confirm against core.
                        explanations = results.get('explanations', {})
                        if i in explanations:
                            st.info(f"**Explanation:** {explanations[i]}")
                else:
                    with st.expander(f"Step {i}: {step[:60]}{'...' if len(step) > 60 else ''} ✅", expanded=False):
                        st.write(f"**Full Step:** {step}")
                        st.success("No errors detected in this step")
    else:
        st.info("Enter solution steps to see the problem flowchart")

    st.markdown("---")

    # --- Step 1: INPUT -------------------------------------------------
    with st.expander("📥 **STEP 1: INPUT** - Problem & Solution Steps", expanded=True):
        st.markdown("""
**What happens here?**
- The system receives the math problem and step-by-step solution
- Input is validated and prepared for processing
- Steps are parsed and segmented for analysis
""")
        if problem:
            st.success(f"✅ Received problem: {problem}")
        if steps:
            st.success(f"✅ Received {len(steps)} solution steps")
        if has_results:
            st.code(f"Problem: {results.get('problem', '')[:100]}...")

    # --- Step 2: PARSING ----------------------------------------------
    with st.expander("🔍 **STEP 2: PARSING** - Extract Mathematical Expressions", expanded=has_results):
        st.markdown("""
**What happens here?**
- Mathematical expressions are extracted using regex patterns
- Operations (+, -, *, /) are identified
- Numbers and variables are recognized
- Each step is prepared for verification
""")
        if has_results:
            steps = results.get('steps', [])
            st.success(f"✅ Parsed {len(steps)} steps")
            # Show a preview of at most the first three parsed steps.
            for i, step in enumerate(steps[:3], 1):
                st.write(f" Step {i}: {step[:60]}...")
            if len(steps) > 3:
                st.write(f" ... and {len(steps) - 3} more steps")

    # --- Step 3: PARALLEL EXECUTION -----------------------------------
    with st.expander("🔄 **STEP 3: PARALLEL EXECUTION** - 3 Models Running Simultaneously", expanded=has_results):
        st.markdown("""
**What happens here?**
- **Model 1 (Symbolic) 🔢**: Uses SymPy to verify all arithmetic calculations
- Weight: 40% (most reliable for math)
- Not affected by sidebar selection
- **Model 2 (LLM Logical) 🧠**: Checks for logical consistency and contradictions
- Weight: 35%
- Uses first selected model from sidebar (e.g., GPT-4)
- Currently: Pattern-based simulation
- **Model 3 (Ensemble) 🤖**: Simulates multiple LLMs voting on solution validity
- Weight: 25%
- Uses ALL selected models from sidebar (GPT-4, Llama 2, Gemini)
- Each model votes, majority wins
- Currently: Pattern-based simulation
All three models run **in parallel** using ThreadPoolExecutor for speed!
""")
        if has_results:
            model_results = results.get('model_results', {})
            col1, col2, col3 = st.columns(3)
            with col1:
                if 'symbolic' in model_results:
                    verdict = model_results['symbolic']['verdict']
                    conf = model_results['symbolic']['confidence'] * 100
                    errors = len(model_results['symbolic'].get('errors', []))
                    if verdict == "ERROR":
                        st.error(f"**🔢 Symbolic**\n\n❌ {verdict}\n\nConfidence: {conf:.1f}%\n\nErrors: {errors}")
                    else:
                        st.success(f"**🔢 Symbolic**\n\n✅ {verdict}\n\nConfidence: {conf:.1f}%\n\nErrors: {errors}")
            with col2:
                if 'llm_logical' in model_results:
                    verdict = model_results['llm_logical']['verdict']
                    conf = model_results['llm_logical']['confidence'] * 100
                    errors = len(model_results['llm_logical'].get('errors', []))
                    if verdict == "ERROR":
                        st.error(f"**🧠 LLM Logical**\n\n❌ {verdict}\n\nConfidence: {conf:.1f}%\n\nErrors: {errors}")
                    else:
                        st.success(f"**🧠 LLM Logical**\n\n✅ {verdict}\n\nConfidence: {conf:.1f}%\n\nErrors: {errors}")
            with col3:
                if 'ensemble' in model_results:
                    verdict = model_results['ensemble']['verdict']
                    conf = model_results['ensemble']['confidence'] * 100
                    agreement = model_results['ensemble'].get('agreement', 'N/A')
                    if verdict == "ERROR":
                        st.error(f"**🤖 Ensemble**\n\n❌ {verdict}\n\nConfidence: {conf:.1f}%\n\nAgreement: {agreement}")
                    else:
                        st.success(f"**🤖 Ensemble**\n\n✅ {verdict}\n\nConfidence: {conf:.1f}%\n\nAgreement: {agreement}")
        else:
            st.info("⏳ Models will execute in parallel when you click 'Verify Solution'")

    # --- Step 4: CONSENSUS --------------------------------------------
    with st.expander("⚖️ **STEP 4: CONSENSUS** - Weighted Voting Mechanism", expanded=has_results):
        st.markdown("""
**What happens here?**
- The system combines results from all 3 models using **weighted voting**:
- **Symbolic Model**: 40% weight (most reliable for arithmetic)
- **LLM Logical Model**: 35% weight (good for reasoning)
- **Ensemble Model**: 25% weight (provides diversity)
- An **error score** is calculated: if > 0.50, verdict = ERROR
- **Confidence** is adjusted based on agreement:
- All 3 agree: confidence boosted by 10%
- 2/3 agree: uses average of agreeing models
- Mixed: confidence penalized by 20%
""")
        if has_results:
            consensus = results.get('consensus', {})
            final_verdict = consensus.get('final_verdict', 'UNKNOWN')
            overall_conf = consensus.get('overall_confidence', 0) * 100
            error_score = consensus.get('error_score', 0)
            agreement = consensus.get('agreement_type', 'UNKNOWN')
            st.markdown(f"""
**Consensus Results:**
- **Final Verdict**: {'❌ ERROR' if final_verdict == 'ERROR' else '✅ VALID'}
- **Overall Confidence**: {overall_conf:.1f}%
- **Error Score**: {error_score:.3f} (threshold: 0.50)
- **Agreement Type**: {agreement}
""")
            # Show how much each model contributed to the weighted vote.
            st.markdown("**Model Contributions:**")
            individual_verdicts = consensus.get('individual_verdicts', {})
            individual_confidences = consensus.get('individual_confidences', {})
            weights = {"symbolic": 0.40, "llm_logical": 0.35, "ensemble": 0.25}
            for model_name, verdict in individual_verdicts.items():
                weight = weights.get(model_name, 0)
                confidence = individual_confidences.get(model_name, 0) * 100
                contribution = weight * individual_confidences.get(model_name, 0)
                st.write(f" - **{model_name.title()}**: {verdict} ({confidence:.1f}% confidence) → {weight*100:.0f}% weight → contributes {contribution:.3f}")

    # --- Step 5: ERROR CLASSIFICATION ---------------------------------
    with st.expander("🏷️ **STEP 5: ERROR CLASSIFICATION** - Categorize & Analyze Errors", expanded=has_results and len(results.get('classified_errors', [])) > 0):
        st.markdown("""
**What happens here?**
- Each detected error is classified into one of 10+ error types:
- Arithmetic Error (calculation mistakes)
- Logical Error (contradictions)
- Operation Mismatch (says one thing, does another)
- Semantic Error (meaning doesn't match)
- And more...
- **Severity** is assigned: HIGH, MEDIUM, or LOW
- **Fixability** is assessed: can the error be auto-corrected?
""")
        if has_results:
            classified_errors = results.get('classified_errors', [])
            if classified_errors:
                st.success(f"✅ Classified {len(classified_errors)} error(s)")
                for error in classified_errors[:3]:
                    st.markdown(f"""
**Error in Step {error.get('step_number', '?')}:**
- **Category**: {error.get('category', 'Unknown')}
- **Severity**: {error.get('severity', 'Unknown')}
- **Fixable**: {'Yes' if error.get('fixable', False) else 'No'}
- **Fixability Score**: {error.get('fixability_score', 0)*100:.0f}%
""")
            else:
                st.info("✅ No errors found - solution is valid!")

    # --- Step 6: EXPLANATION GENERATION -------------------------------
    with st.expander("💬 **STEP 6: EXPLANATION GENERATION** - Create Human-Readable Explanations", expanded=has_results and len(results.get('explanations', {})) > 0):
        st.markdown("""
**What happens here?**
- For each error, a natural language explanation is generated
- Explains **why** the error occurred
- Provides educational context
- Suggests how to avoid similar mistakes
- Includes learning tips
""")
        if has_results:
            explanations = results.get('explanations', {})
            if explanations:
                st.success(f"✅ Generated {len(explanations)} explanation(s)")
                for step_num, explanation in list(explanations.items())[:2]:
                    with st.container():
                        st.markdown(f"**Step {step_num} Explanation:**")
                        st.info(explanation)
            else:
                st.info("✅ No explanations needed - solution is correct!")

    # --- Step 7: ERROR CORRECTION -------------------------------------
    with st.expander("🔧 **STEP 7: ERROR CORRECTION** - Automatic Fixes", expanded=has_results and results.get('correction', {}).get('fixed_count', 0) > 0):
        st.markdown("""
**What happens here?**
- Fixable errors are automatically corrected
- Arithmetic errors: correct values are calculated and replaced
- Operation mismatches: operations are corrected
- Success rate is tracked for each error type
- Errors requiring manual review are flagged
""")
        if has_results:
            correction = results.get('correction', {})
            fixed_count = correction.get('fixed_count', 0)
            if fixed_count > 0:
                st.success(f"✅ Fixed {fixed_count} error(s)")
                st.write(f"**Success Rate**: {correction.get('success_rate', 0)*100:.1f}%")
                correction_log = correction.get('correction_log', [])
                if correction_log:
                    for log_entry in correction_log[:2]:
                        st.markdown(f"""
**Step {log_entry['step']} ({log_entry['type']}):**
- Original: `{log_entry['original']}`
- Corrected: `{log_entry['corrected']}`
""")
            else:
                st.info("✅ No corrections needed")

    # --- Step 8: OUTPUT -----------------------------------------------
    with st.expander("📤 **STEP 8: OUTPUT** - Final Results", expanded=has_results):
        st.markdown("""
**What happens here?**
- Final verdict is displayed (VALID or ERROR)
- Overall confidence score is shown
- All errors with explanations are presented
- Processing time is reported
- Results are ready for review
""")
        if has_results:
            consensus = results.get('consensus', {})
            final_verdict = consensus.get('final_verdict', 'UNKNOWN')
            overall_conf = consensus.get('overall_confidence', 0) * 100
            processing_time = results.get('processing_time', 0)
            total_errors = len(results.get('classified_errors', []))
            if final_verdict == "ERROR":
                st.error(f"**Final Verdict**: ❌ {final_verdict}")
            else:
                st.success(f"**Final Verdict**: ✅ {final_verdict}")
            st.metric("Overall Confidence", f"{overall_conf:.1f}%")
            st.metric("Processing Time", f"{processing_time:.3f}s")
            st.metric("Total Errors Found", total_errors)
            st.success("✅ Verification complete! Results displayed above.")
        else:
            st.info("⏳ Results will appear here after verification")

    # --- Static processing-flow diagram -------------------------------
    st.markdown("---")
    st.markdown("### 📊 Processing Flow Diagram")
    st.markdown("""
```
┌─────────────────────────────────────────────────────────┐
│ 📥 INPUT │
│ Problem + Solution Steps │
└────────────────────┬────────────────────────────────────┘
 │
 ▼
┌─────────────────────────────────────────────────────────┐
│ 🔍 PARSING │
│ Extract expressions, identify operations │
└────────────────────┬────────────────────────────────────┘
 │
 ▼
┌─────────────────────────────────────────────────────────┐
│ 🔄 PARALLEL EXECUTION │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
│ │ Symbolic │ │ LLM │ │ Ensemble │ │
│ │ (SymPy) │ │ Logical │ │ (Voting)│ │
│ │ 40% │ │ 35% │ │ 25% │ │
│ └────┬─────┘ └────┬─────┘ └────┬─────┘ │
└───────┼─────────────┼──────────────┼────────────────────┘
 │ │ │
 └─────────────┴──────────────┘
 │
 ▼
┌─────────────────────────────────────────────────────────┐
│ ⚖️ CONSENSUS │
│ Weighted Voting → Final Verdict & Confidence │
└────────────────────┬────────────────────────────────────┘
 │
 ▼
┌─────────────────────────────────────────────────────────┐
│ 🏷️ ERROR CLASSIFICATION │
│ Categorize → Severity → Fixability │
└────────────────────┬────────────────────────────────────┘
 │
 ▼
┌─────────────────────────────────────────────────────────┐
│ 💬 EXPLANATION GENERATION │
│ Natural language explanations for each error │
└────────────────────┬────────────────────────────────────┘
 │
 ▼
┌─────────────────────────────────────────────────────────┐
│ 🔧 ERROR CORRECTION │
│ Auto-fix fixable errors → Track success rate │
└────────────────────┬────────────────────────────────────┘
 │
 ▼
┌─────────────────────────────────────────────────────────┐
│ 📤 OUTPUT │
│ Final Verdict + Confidence + All Details │
└─────────────────────────────────────────────────────────┘
```""")
def display_logs():
    """Render the accumulated processing log, color-coded by status.

    Entries whose status starts with "✓" are shown as success, "❌" as
    error, "⚠️" as warning, and anything else as info. Renders nothing
    when the log is empty.
    """
    if not st.session_state.steps_log:
        return
    st.subheader("📊 Processing Flow")
    for entry in st.session_state.steps_log:
        status = entry["status"]
        # Build the line once; only the rendering widget differs.
        message = f"**{status}** [{entry['model']}] {entry['step']}: {entry['details']}"
        if status.startswith("✓"):
            st.success(message)
        elif status.startswith("❌"):
            st.error(message)
        elif status.startswith("⚠️"):
            st.warning(message)
        else:
            st.info(message)
def display_results():
    """Render the full verification results stored in session state.

    Shows headline metrics (verdict, confidence, timing), per-model
    verdict cards, the consensus breakdown, classified error details
    with explanations, and any corrections applied. Renders nothing
    when no results are available.
    """
    if not st.session_state.results:
        return
    results = st.session_state.results

    st.header("🎯 Results")

    # --- Headline metrics: verdict, confidence, processing time ---
    consensus = results.get("consensus", {})
    final_verdict = consensus.get("final_verdict", "UNKNOWN")
    overall_confidence = consensus.get("overall_confidence", 0.0)
    processing_time = results.get("processing_time", 0.0)
    col1, col2, col3 = st.columns(3)
    with col1:
        if final_verdict == "ERROR":
            st.error(f"**Final Verdict:** ❌ {final_verdict}")
        else:
            st.success(f"**Final Verdict:** ✅ {final_verdict}")
    with col2:
        st.metric("**Confidence**", f"{overall_confidence * 100:.1f}%")
    with col3:
        st.metric("**Processing Time**", f"{processing_time:.2f}s")

    # --- Per-model verdict cards ---
    st.subheader("🤖 Model Verdicts")
    model_results = results.get("model_results", {})
    model_display_names = {
        "symbolic": "🔢 Symbolic",
        "llm_logical": "🧠 LLM Logical",
        "ensemble": "🤖 Ensemble",
    }
    for column, model_name in zip(st.columns(3), ("symbolic", "llm_logical", "ensemble")):
        with column:
            if model_name not in model_results:
                continue
            model_result = model_results[model_name]
            verdict = model_result.get("verdict", "UNKNOWN")
            confidence = model_result.get("confidence", 0.0)
            errors_count = len(model_result.get("errors", []))
            icon = "❌" if verdict == "ERROR" else "✅"
            card = (
                f"**{model_display_names[model_name]}**\n\n"
                f"Verdict: {icon} {verdict}\n\n"
                f"Confidence: {confidence * 100:.1f}%\n\n"
                f"Errors: {errors_count}"
            )
            # Red card for errors, green otherwise.
            (st.error if verdict == "ERROR" else st.success)(card)

    # --- Consensus mechanism breakdown ---
    st.subheader("⚖️ Consensus Mechanism")
    agreement_type = consensus.get("agreement_type", "UNKNOWN")
    error_score = consensus.get("error_score", 0.0)
    individual_verdicts = consensus.get("individual_verdicts", {})
    individual_confidences = consensus.get("individual_confidences", {})
    st.info(f"**Agreement:** {agreement_type}")
    st.info(f"**Error Score:** {error_score:.3f} (threshold: 0.50)")
    st.write("**Individual Model Results:**")
    for model_name, verdict in individual_verdicts.items():
        confidence = individual_confidences.get(model_name, 0.0)
        st.write(f"- {model_display_names.get(model_name, model_name)}: {verdict} (confidence: {confidence * 100:.1f}%)")

    # --- Error details with explanations ---
    classified_errors = results.get("classified_errors", [])
    if classified_errors:
        st.subheader("🔴 Error Details")
        explanations = results.get("explanations", {})
        for error in classified_errors:
            with st.expander(f"Error in Step {error.get('step_number', 0)}: {error.get('category', 'Unknown')}"):
                st.write(f"**Type:** {error.get('type', 'unknown')}")
                st.write(f"**Found:** {error.get('found', 'N/A')}")
                st.write(f"**Correct:** {error.get('correct', 'N/A')}")
                st.write(f"**Severity:** {error.get('severity', 'UNKNOWN')}")
                st.write(f"**Fixable:** {'Yes' if error.get('fixable', False) else 'No'}")
                step_num = error.get("step_number", 0)
                if step_num in explanations:
                    st.write(f"**Explanation:** {explanations[step_num]}")

    # --- Corrections applied ---
    correction = results.get("correction", {})
    if correction and correction.get("fixed_count", 0) > 0:
        st.subheader("🔧 Corrections Applied")
        st.success(f"**Fixed:** {correction.get('fixed_count', 0)} / {correction.get('total_fixable', 0)} errors")
        st.write(f"**Success Rate:** {correction.get('success_rate', 0.0) * 100:.1f}%")
        correction_log = correction.get("correction_log", [])
        if correction_log:
            with st.expander("View Correction Log"):
                for log_entry in correction_log:
                    st.write(f"**Step {log_entry['step']}:** {log_entry['type']}")
                    st.write(f"Original: {log_entry['original']}")
                    st.write(f"Corrected: {log_entry['corrected']}")
# Main UI: page chrome and title.
st.set_page_config(page_title="Math Verification System", page_icon="🔢", layout="wide")
st.title("🔢 Mathematical Reasoning Verification System")
st.markdown("3-Model Parallel Verification with Weighted Consensus")

# Sidebar configuration: which LLMs participate in models 2 and 3.
with st.sidebar:
    st.header("⚙️ Configuration")
    gpt4_enabled = st.checkbox("GPT-4", value=True)
    llama_enabled = st.checkbox("Llama 2", value=True)
    gemini_enabled = st.checkbox("Gemini", value=True)
    selected_models = [
        name
        for name, enabled in (
            ("GPT-4", gpt4_enabled),
            ("Llama 2", llama_enabled),
            ("Gemini", gemini_enabled),
        )
        if enabled
    ]
    # If the user unchecks everything, fall back to the full set.
    if not selected_models:
        selected_models = ["GPT-4", "Llama 2", "Gemini"]
    st.info(f"Selected models: {', '.join(selected_models)}")

    st.markdown("---")
    st.markdown("### 📖 How Models Are Used")
    with st.expander("ℹ️ Click to see how sidebar models work"):
        st.markdown("""
**Model 1 (Symbolic) 🔢:**
- Uses SymPy library (not affected by sidebar)
- Verifies arithmetic calculations
- Weight: 40%
**Model 2 (LLM Logical) 🧠:**
- Uses first selected model from sidebar
- Checks logical consistency
- Weight: 35%
**Model 3 (Ensemble) 🤖:**
- Uses ALL selected models from sidebar
- Each model votes on solution validity
- Majority voting determines verdict
- Weight: 25%
**Note:** Currently using pattern-based simulation.
For production, integrate real LLM APIs (OpenAI, Anthropic, Google).
""")

    st.markdown("### 🔄 Model Flow Diagram")
    st.code("""
Sidebar Selection
┌─────────────────┐
│ GPT-4 ✓ │
│ Llama 2 ✓ │──┐
│ Gemini ✓ │ │
└─────────────────┘ │
 │
┌───────────────────────┴─────────────────────┐
│ │
▼ ▼
Model 2 (LLM Logical) Model 3 (Ensemble)
Uses: GPT-4 (first selected) Uses: All selected
Weight: 35% Weight: 25%
└─────────────────────────────────────────┘
 │
 ▼
Consensus Mechanism
(Weighted Voting)
""", language=None)
# Main layout: inputs on the left, live flowchart on the right.
col_left, col_right = st.columns([1, 1])

with col_left:
    st.header("📝 Input")
    # Note: the default solution deliberately contains an arithmetic
    # mistake ("5 - 1 = 6") so the demo has something to detect.
    problem = st.text_area(
        "Problem:",
        height=80,
        placeholder="Enter the math problem here...",
        value="Janet has 3 apples. She buys 2 more. She gives 1 away. How many?"
    )
    steps_input = st.text_area(
        "Solution Steps (one per line):",
        height=120,
        placeholder="Enter solution steps, one per line...",
        value="Janet starts with 3 apples\nShe buys 2 more: 3 + 2 = 5 apples\nShe gives 1 away: 5 - 1 = 6 apples"
    )
    col_btn1, col_btn2 = st.columns(2)
    with col_btn1:
        verify_button = st.button("🚀 Verify Solution", type="primary", use_container_width=True)
    with col_btn2:
        clear_button = st.button("🔄 Clear", use_container_width=True)

with col_right:
    st.header("🎯 Live Flowchart")
    display_flowchart(problem=problem, steps_input=steps_input)
# Handle the Verify button: reset state, run the pipeline, log progress.
if verify_button:
    # Discard any previous run before starting fresh.
    st.session_state.steps_log = []
    st.session_state.results = None

    steps = [s.strip() for s in steps_input.split('\n') if s.strip()]
    if not problem or not steps:
        st.warning("Please enter both problem and solution steps.")
    else:
        add_log("Verification Started", "System", "⏳", "Initializing models...")
        with st.spinner("Running verification..."):
            try:
                add_log("Model 1 Started", "Symbolic", "🔢", "Checking arithmetic...")
                add_log("Model 2 Started", "LLM Logical", "🧠", "Checking logical consistency...")
                add_log("Model 3 Started", "Ensemble", "🤖", "Running ensemble voting...")

                # LLM Logical uses the first selected model; default GPT-4.
                llm_model_name = selected_models[0] if selected_models else "GPT-4"
                results = run_verification_parallel(
                    problem=problem,
                    steps=steps,
                    model_name=llm_model_name,
                    model_list=selected_models,
                )

                # One completion log entry per model.
                for model_name, model_result in results["model_results"].items():
                    verdict = model_result.get("verdict", "UNKNOWN")
                    errors_count = len(model_result.get("errors", []))
                    status = "✓ ERROR" if verdict == "ERROR" else "✓ VALID"
                    add_log(
                        f"Model {model_name} Completed",
                        model_name.title(),
                        status,
                        f"Found {errors_count} error(s)"
                    )

                consensus_verdict = results["consensus"].get("final_verdict", "UNKNOWN")
                add_log(
                    "Consensus Computed",
                    "Consensus",
                    "⚖️",
                    f"Final verdict: {consensus_verdict}"
                )

                st.session_state.results = results
            except Exception as e:
                st.error(f"Error during verification: {str(e)}")
                add_log("Error", "System", "❌", str(e))

# Handle the Clear button: wipe state and redraw the page.
if clear_button:
    st.session_state.steps_log = []
    st.session_state.results = None
    st.rerun()

# Display logs and results below the main layout.
display_logs()
display_results()