# NOTE(review): removed three lines of non-code residue ("Spaces: /
# Sleeping / Sleeping") that leaked in from a Hugging Face Spaces status
# page during capture; they were never part of the Python source.
#!/usr/bin/env python
"""
Gradio Web Interface for Math Validator

Launches a local web UI (see main()) that drives ``universal_validator.py``
and ``run_parallel.py`` as subprocesses and streams their progress back
into the browser.
"""
import gradio as gr
import pandas as pd
import os
import subprocess
import sys
import json
from datetime import datetime
import threading
import queue
import time
from dotenv import load_dotenv

# Load environment variables from .env file (API keys are read via
# os.getenv later, so this must run before any validation starts).
load_dotenv()
class ValidatorGUI:
    """State holder and controller for the Gradio math-validator front end."""

    def __init__(self):
        """Initialize run state, progress counters, and model catalogues."""
        # Handle to the validator subprocess (None while idle).
        self.process = None
        self.output_queue = queue.Queue()
        self.is_running = False

        # Workbook statistics, filled in by analyze_file().
        self.total_questions = 0
        self.math_questions = 0

        # Live progress counters, updated by parse_progress_line().
        for counter in (
            "questions_processed",
            "correct_answers",
            "incorrect_answers",
            "timeouts",
            "errors",
        ):
            setattr(self, counter, 0)

        # Models served directly through the OpenAI API.
        self.openai_models = [
            "o3-mini",
            "gpt-4o",
            "gpt-5",
            "gpt-5-mini",
            "gpt-5-nano",
            "gpt-4-turbo",
        ]

        # Models routed through OpenRouter.
        self.openrouter_models = [
            # Anthropic Claude 4 series
            "anthropic/claude-4-opus",
            "anthropic/claude-4-sonnet",
            # Anthropic Claude 3.5 series
            "anthropic/claude-3.5-sonnet",
            "anthropic/claude-3-5-sonnet-20241022",
            "anthropic/claude-3-opus",
            "anthropic/claude-3-haiku",
            # xAI Grok series (including Grok 4)
            "x-ai/grok-4",
            "x-ai/grok-2",
            "x-ai/grok-2-1212",
            # DeepSeek reasoning models
            "deepseek/deepseek-r1",
            "deepseek/deepseek-v3",
            "deepseek/deepseek-chat",
            # Google Gemini
            "google/gemini-2.0-pro",
            "google/gemini-2.0-flash",
            "google/gemini-pro-1.5",
            "google/gemini-flash-1.5",
            # Baidu ERNIE
            "baidu/ernie-4.0-turbo-8k",
            "baidu/ernie-bot-4",
            # Meta Llama
            "meta-llama/llama-3.2-405b",
            "meta-llama/llama-3.1-405b-instruct",
            # Mistral
            "mistralai/mistral-large",
            "mistralai/mixtral-8x22b-instruct",
        ]

        # Combined list used to populate both model dropdowns.
        self.all_models = self.openai_models + self.openrouter_models
def get_excel_files(self):
    """Return Excel workbooks in the CWD that look like validation inputs.

    Excludes validator output files.  The original check only excluded
    names ending in ``_validated.xlsx``, but generate_output_filename()
    produces ``*_validated_<timestamp>_<range>.xlsx`` names, which slipped
    past that suffix test and reappeared in the input dropdown.  Filtering
    on the ``_validated`` marker anywhere in the name catches both forms.
    """
    candidates = []
    for name in os.listdir('.'):
        if not name.endswith('.xlsx'):
            continue
        if '_validated' in name:
            continue  # skip validator outputs (legacy and timestamped)
        candidates.append(name)
    return candidates
def analyze_file(self, file_path):
    """Inspect the selected workbook and build a Markdown summary.

    Returns a ``(summary_markdown, total_rows, math_question_count)``
    tuple; on any failure the summary is an error message and both
    counts are zero.  Also caches the counts on ``self`` for later use.
    """
    if not file_path:
        return "No file selected", 0, 0
    try:
        frame = pd.read_excel(file_path, sheet_name='Data')
        self.total_questions = len(frame)

        # Count rows whose subject looks mathematical; when the column
        # is missing, treat every row as a math question.
        if 'raw_subject' not in frame.columns:
            self.math_questions = len(frame)
        else:
            is_math = frame['raw_subject'].str.lower().str.contains(
                'math|statistic|calculus|algebra|geometry|trigonometry',
                na=False, regex=True
            )
            self.math_questions = is_math.sum()

        # Rows that carry an attached image URL.
        image_count = (
            frame['file_url'].notna().sum()
            if 'file_url' in frame.columns else 0
        )

        summary = f"""### File Analysis
**File:** {os.path.basename(file_path)}
**Total rows:** {self.total_questions}
**Math questions:** {self.math_questions}
**Questions with images:** {image_count}
**Columns found:** {', '.join(frame.columns[:10])}{'...' if len(frame.columns) > 10 else ''}
**Estimated processing time:**
- Serial: ~{self.math_questions * 30 // 60} minutes
- Parallel (4 processes): ~{self.math_questions * 30 // (60 * 4)} minutes
"""
        return summary, self.total_questions, self.math_questions
    except Exception as e:
        return f"Error analyzing file: {str(e)}", 0, 0
def validate_config(self, file_path, solver_model, recon_model, num_processes, batch_size): | |
"""Validate configuration before running""" | |
errors = [] | |
if not file_path or not os.path.exists(file_path): | |
errors.append("Please select a valid Excel file") | |
if not solver_model: | |
errors.append("Please select a solver model") | |
if not recon_model: | |
errors.append("Please select a reconciliation model") | |
# Check API keys | |
needs_openai = solver_model in self.openai_models or recon_model in self.openai_models | |
needs_openrouter = solver_model in self.openrouter_models or recon_model in self.openrouter_models | |
if needs_openai and not os.getenv('OPENAI_API_KEY'): | |
errors.append("OPENAI_API_KEY not found in environment") | |
if needs_openrouter and not os.getenv('OPENROUTER_API_KEY'): | |
errors.append("OPENROUTER_API_KEY not found in environment") | |
return errors | |
def generate_output_filename(self, file_path, start_q, end_q): | |
"""Generate output filename with timestamp and range""" | |
base_name = os.path.basename(file_path).replace('.xlsx', '') | |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") | |
if start_q is not None and end_q is not None and (start_q > 0 or end_q < self.math_questions): | |
# Add range to filename | |
range_str = f"_q{start_q+1}_q{end_q}" | |
else: | |
range_str = "_full" | |
return f"{base_name}_validated_{timestamp}{range_str}.xlsx" | |
def parse_progress_line(self, line): | |
"""Parse output line for progress information""" | |
# Parse based on the new [TAG] format | |
line_lower = line.lower() | |
if "[ok] got answer" in line_lower and "chars" in line_lower: | |
self.questions_processed += 1 | |
elif "[fail] failed to get answer" in line_lower: | |
self.errors += 1 | |
self.questions_processed += 1 # Still count as processed | |
elif "[match]" in line_lower: | |
self.correct_answers += 1 | |
elif "[mismatch]" in line_lower: | |
self.incorrect_answers += 1 | |
elif "[timeout]" in line_lower: | |
self.timeouts += 1 | |
elif "[error]" in line_lower: | |
if "failed after" in line_lower: | |
self.errors += 1 | |
elif "[warning]" in line_lower: | |
# Just a warning, not an error | |
pass | |
elif "question" in line_lower and "getting answer from" in line_lower: | |
# This indicates a question is starting to be processed | |
pass | |
# Also parse parallel processing output | |
elif "starting process for questions" in line_lower: | |
# Parallel process starting | |
pass | |
elif "completed range" in line_lower: | |
# Parallel process completed a range | |
import re | |
# Try to extract question count from "Completed range X-Y" | |
match = re.search(r'range (\d+)-(\d+)', line_lower) | |
if match: | |
start, end = int(match.group(1)), int(match.group(2)) | |
# This is approximate since we don't know exact results | |
self.questions_processed = max(self.questions_processed, end) | |
def get_progress_stats(self): | |
"""Get formatted progress statistics""" | |
if self.questions_processed == 0: | |
return "Waiting for processing to start..." | |
accuracy = (self.correct_answers / self.questions_processed * 100) if self.questions_processed > 0 else 0 | |
return f"""**Progress Stats:** | |
- Processed: {self.questions_processed} | |
- Correct: {self.correct_answers} ({accuracy:.1f}%) | |
- Incorrect: {self.incorrect_answers} | |
- Timeouts: {self.timeouts} | |
- Errors: {self.errors} | |
""" | |
def run_validation(self, file_path, solver_model, recon_model, image_mode,
                   num_processes, batch_size, start_q, end_q, compile_latex, progress=gr.Progress()):
    """Run the validation process.

    Generator consumed by Gradio: every ``yield`` refreshes the UI with a
    ``(log_text, result_file_path_or_None, stats_markdown)`` tuple.

    Args:
        file_path: workbook to validate.
        solver_model / recon_model: model names (already stripped of the
            "(recommended)" suffix by the caller).
        image_mode: "when_needed" | "always" | "never".
        num_processes: >1 switches to run_parallel.py for ranges > 20.
        batch_size: forwarded to the validator as --batch-size.
        start_q: 0-indexed first question (inclusive).
        end_q: end of range (exclusive) -- also drives the progress math.
        compile_latex: forward --compile-latex to the validator.
        progress: Gradio progress callback (the mutable-looking default is
            Gradio's documented dependency-injection idiom, not a bug).
    """
    # Reset progress counters for a fresh run.
    self.questions_processed = 0
    self.correct_answers = 0
    self.incorrect_answers = 0
    self.timeouts = 0
    self.errors = 0

    # Validate configuration before spawning anything.
    errors = self.validate_config(file_path, solver_model, recon_model, num_processes, batch_size)
    if errors:
        yield f"### Configuration Errors\n" + "\n".join(f"- {e}" for e in errors), None, ""
        return

    self.is_running = True
    output_log = []

    # Generate a timestamped output filename next to the input file.
    output_file = self.generate_output_filename(file_path, start_q, end_q)
    output_path = os.path.join(os.path.dirname(file_path), output_file)

    try:
        # Single-process invocation of the validator script.
        base_cmd = [
            sys.executable, "universal_validator.py", file_path,
            "--model", solver_model,
            "--reconciliation-model", recon_model,
            "--images", image_mode,
            "--batch-size", str(batch_size),
            "--output", output_path
        ]
        # Add range parameters if specified.
        if start_q is not None and start_q >= 0:
            base_cmd.extend(["--start", str(start_q)])
        if end_q is not None and end_q > 0:
            base_cmd.extend(["--end", str(end_q)])
        # Add LaTeX compilation flag if requested.
        if compile_latex:
            base_cmd.append("--compile-latex")

        # Use parallel processing only for larger ranges; the process
        # startup overhead is not worth it for <= 20 questions.
        if num_processes > 1 and (end_q - start_q) > 20:
            cmd = [
                sys.executable, "run_parallel.py", file_path,
                "--num-processes", str(num_processes),
                "--solver", solver_model,
                "--reconciler", recon_model,
                "--images", image_mode,
                "--batch-size", str(batch_size),
                "--output", output_path,
                "--start-range", str(start_q),
                "--end-range", str(end_q)
            ]
            if compile_latex:
                cmd.append("--compile-latex")
            print(f"[GUI] Using parallel processing with {num_processes} processes")
        else:
            # Use single process for small ranges.
            cmd = base_cmd
            if num_processes > 1 and (end_q - start_q) <= 20:
                print(f"[GUI] Range too small for parallel processing, using single process")

        # Start the subprocess and announce the run in the log.
        progress(0, desc="Starting validation...")
        output_log.append(f"Running: {' '.join(cmd)}\n")
        output_log.append(f"Output file: {output_path}\n")
        output_log.append(f"Question range: {start_q+1} to {end_q}\n\n")
        print(f"[GUI] Starting subprocess: {' '.join(cmd)}")
        try:
            self.process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # merge stderr so one reader sees everything
                text=True,
                bufsize=1,                 # line-buffered
                universal_newlines=True,
                encoding='utf-8',
                errors='replace'           # never crash on odd bytes from the child
            )
            print(f"[GUI] Process started with PID: {self.process.pid}")
        except Exception as e:
            error_msg = f"Failed to start validator: {str(e)}"
            print(f"[GUI Error] {error_msg}")
            yield error_msg, None, ""
            return

        # Stream the child's output line-by-line until it exits.
        lines_processed = 0
        last_update_time = time.time()
        while True:
            line = self.process.stdout.readline()
            if not line:
                # Empty read: either EOF (process exited) or just no data yet.
                if self.process.poll() is not None:
                    break
                time.sleep(0.1)
                continue
            output_log.append(line)
            self.parse_progress_line(line)
            # Debug: print every line so the terminal mirrors the child.
            print(f"[GUI Debug] {line.strip()}")
            # Update the progress bar based on counters parsed above.
            if "processing batch" in line.lower() or "question" in line.lower():
                lines_processed += 1
                if self.math_questions > 0 and self.questions_processed > 0:
                    actual_progress = min(self.questions_processed / (end_q - start_q), 1.0)
                    progress(actual_progress, desc=f"Processing question {self.questions_processed}/{end_q - start_q}")
            # Yield intermediate results with stats every 2 seconds or every
            # 5 counted lines, so the UI stays responsive without spamming.
            current_time = time.time()
            if lines_processed % 5 == 0 or (current_time - last_update_time) > 2:
                stats = self.get_progress_stats()
                output_text = stats + "\n\n" + "="*60 + "\n" + "".join(output_log[-50:])
                yield output_text, None, stats
                last_update_time = current_time
        self.process.wait()

        # Build the final report (last 200 log lines only, to bound size).
        final_stats = self.get_progress_stats()
        output_text = f"### Validation Complete\n\n{final_stats}\n\n" + "="*60 + "\n\nFull Log:\n" + "".join(output_log[-200:])
        # Offer the output file for download if it was produced.
        if os.path.exists(output_path):
            yield output_text, output_path, final_stats
        else:
            # Fall back to the legacy "<name>_validated.xlsx" convention in
            # case the validator ignored --output.
            fallback_path = file_path.replace('.xlsx', '_validated.xlsx')
            if os.path.exists(fallback_path):
                yield output_text, fallback_path, final_stats
            else:
                yield output_text, None, final_stats
    except Exception as e:
        stats = self.get_progress_stats()
        yield f"Error: {str(e)}\n\n{stats}\n\n{''.join(output_log)}", None, stats
    finally:
        # Always clear run state, even on error, so Stop/Start stay usable.
        self.is_running = False
        self.process = None
def stop_validation(self): | |
"""Stop the running validation""" | |
if self.process: | |
self.process.terminate() | |
time.sleep(1) | |
if self.process.poll() is None: | |
self.process.kill() | |
return "Validation stopped" | |
return "No validation running" | |
def create_interface(self):
    """Create the Gradio interface.

    Builds a three-tab Blocks app (Validation / Configuration / Results
    Analysis) and wires the widget events to the ValidatorGUI methods.
    Returns the Blocks object; the caller launches it.
    """
    # NOTE(review): the source's indentation was lost in transit, so the
    # widget nesting below is reconstructed from declaration order --
    # confirm the layout against the running UI.
    with gr.Blocks(title="Math Validator", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# Math Question Validator")
        gr.Markdown("Web interface for validating mathematical questions and answers")

        with gr.Tab("Validation"):
            with gr.Row():
                with gr.Column(scale=1):
                    # File selection
                    file_dropdown = gr.Dropdown(
                        choices=self.get_excel_files(),
                        label="Select Excel File",
                        value=self.get_excel_files()[0] if self.get_excel_files() else None
                    )
                    refresh_btn = gr.Button("🔄 Refresh Files", size="sm")
                    file_info = gr.Markdown("Select a file to see analysis")
                    # Question range selection (dynamically updated)
                    gr.Markdown("### Question Range")
                    with gr.Row():
                        start_question = gr.Number(
                            label="Start Question",
                            value=1,
                            minimum=1,
                            step=1,
                            info="First question to process"
                        )
                        end_question = gr.Number(
                            label="End Question",
                            value=100,
                            minimum=1,
                            step=1,
                            info="Last question to process"
                        )
                    use_all_questions = gr.Checkbox(
                        label="Process all questions",
                        value=True,
                        info="Uncheck to specify custom range"
                    )
                with gr.Column(scale=2):
                    with gr.Row():
                        # Model selection
                        solver_dropdown = gr.Dropdown(
                            choices=["o3-mini (recommended)"] + self.all_models,
                            value="o3-mini (recommended)",
                            label="Solver Model",
                            info="Model for answering questions"
                        )
                        recon_dropdown = gr.Dropdown(
                            choices=["gpt-4o (recommended)"] + self.all_models,
                            value="gpt-4o (recommended)",
                            label="Reconciliation Model",
                            info="Model for comparing answers"
                        )
                    with gr.Row():
                        image_mode = gr.Radio(
                            choices=["when_needed", "always", "never"],
                            value="when_needed",
                            label="Image Handling",
                            info="When to include images with questions"
                        )
                        parallel_slider = gr.Slider(
                            minimum=1,
                            maximum=8,
                            value=1,
                            step=1,
                            label="Parallel Processes",
                            info="Number of concurrent processes (1 = serial)"
                        )
                        batch_slider = gr.Slider(
                            minimum=1,
                            maximum=20,
                            value=5,
                            step=1,
                            label="Batch Size",
                            info="Questions per batch"
                        )
                    # LaTeX compilation option
                    compile_latex = gr.Checkbox(
                        label="Compile LaTeX reconciliation documents to PDF",
                        value=False,
                        info="Requires pdflatex installed (slower but produces PDFs)"
                    )
            with gr.Row():
                run_btn = gr.Button("▶️ Start Validation", variant="primary", size="lg")
                stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg")
            # Output section with progress stats
            progress_stats = gr.Markdown("**Progress:** Waiting to start...")
            output_text = gr.Textbox(
                label="Validation Output",
                lines=20,
                max_lines=30,
                value="Click 'Start Validation' to begin..."
            )
            output_file = gr.File(
                label="Download Results",
                visible=False
            )

            # Event handlers
            def update_file_info(file_path):
                # Analyze the chosen file and push its math-question count
                # into the End Question field.
                if file_path:
                    full_path = os.path.join(os.getcwd(), file_path)
                    summary, total, math_q = self.analyze_file(full_path)
                    # Update end question to match file
                    return summary, math_q
                return "No file selected", 100

            def refresh_files():
                # Re-scan the working directory for candidate workbooks.
                files = self.get_excel_files()
                return gr.update(choices=files, value=files[0] if files else None)

            def clean_model_name(model):
                # Remove "(recommended)" suffix if present
                if "(recommended)" in model:
                    return model.split(" (")[0]
                return model

            def toggle_range_inputs(use_all):
                # Enable/disable range inputs based on checkbox
                return gr.update(interactive=not use_all), gr.update(interactive=not use_all)

            def run_with_clean_models(file_path, solver, recon, images, parallel, batch,
                                      use_all, start_q, end_q, compile_tex):
                # Bridge between the UI widgets and run_validation: strips
                # the "(recommended)" labels, converts the 1-indexed UI range
                # to the 0-indexed/exclusive range the validator expects, and
                # relays every progress update to the three output widgets.
                solver_clean = clean_model_name(solver)
                recon_clean = clean_model_name(recon)
                if file_path:
                    full_path = os.path.join(os.getcwd(), file_path)
                    # Adjust question range (convert to 0-indexed)
                    if use_all:
                        actual_start = 0
                        actual_end = self.math_questions
                    else:
                        actual_start = max(0, int(start_q) - 1)  # Convert to 0-indexed
                        actual_end = min(self.math_questions, int(end_q))
                    # Run validation with progress updates
                    for result in self.run_validation(
                        full_path, solver_clean, recon_clean, images, parallel, batch,
                        actual_start, actual_end, compile_tex
                    ):
                        if len(result) == 3:
                            result_text, result_file, stats = result
                            if result_file:
                                # Show the download widget only when a file exists.
                                yield result_text, gr.update(value=result_file, visible=True), stats
                            else:
                                yield result_text, gr.update(visible=False), stats
                        else:
                            yield result[0], gr.update(visible=False), result[1] if len(result) > 1 else ""
                else:
                    yield "No file selected", gr.update(visible=False), ""

            file_dropdown.change(update_file_info, inputs=[file_dropdown],
                                 outputs=[file_info, end_question])
            refresh_btn.click(refresh_files, outputs=[file_dropdown])
            # Toggle range inputs when checkbox changes
            use_all_questions.change(toggle_range_inputs, inputs=[use_all_questions],
                                     outputs=[start_question, end_question])
            run_btn.click(
                run_with_clean_models,
                inputs=[file_dropdown, solver_dropdown, recon_dropdown,
                        image_mode, parallel_slider, batch_slider,
                        use_all_questions, start_question, end_question, compile_latex],
                outputs=[output_text, output_file, progress_stats]
            )
            stop_btn.click(self.stop_validation, outputs=[output_text])

        with gr.Tab("Configuration"):
            gr.Markdown("""
### API Configuration
Make sure you have the required API keys set as environment variables:
- **OPENAI_API_KEY**: Required for OpenAI models (o3-mini, GPT-5, GPT-4o)
- **OPENROUTER_API_KEY**: Required for Claude, Grok, Gemini, and other models
### Model Recommendations
**For best results:**
- Solver: o3-mini (best accuracy)
- Reconciliation: gpt-4o (fast and reliable)
**For speed:**
- Use 4-6 parallel processes
- Batch size of 5-10
**For GPT-5 testing:**
- Use gpt-5-mini (faster than gpt-5)
- Use gpt-4o for reconciliation (GPT-5 has timeout issues)
""")
            # Check current configuration (evaluated once, at build time)
            config_status = []
            if os.getenv('OPENAI_API_KEY'):
                config_status.append("✅ OPENAI_API_KEY is set")
            else:
                config_status.append("❌ OPENAI_API_KEY is not set")
            if os.getenv('OPENROUTER_API_KEY'):
                config_status.append("✅ OPENROUTER_API_KEY is set")
            else:
                config_status.append("❌ OPENROUTER_API_KEY is not set")
            gr.Markdown("### Current Status\n" + "\n".join(config_status))

        with gr.Tab("Results Analysis"):
            gr.Markdown("""
### How to Analyze Results
After validation completes:
1. **Download the validated Excel file** - Contains all results
2. **Check the latex_documents folder** - Contains reconciliation documents
3. **Run analysis scripts:**
- `python analyze_reconciliations.py` - Analyze which answers were vindicated
- `python summarize_results.py` - Get overall statistics
### Understanding Results
- **answer_match = Yes**: Model answer matches reference
- **answer_match = No**: Mismatch (see LaTeX reconciliation)
- **latex_file**: Path to detailed reconciliation document
- **model_answer_file**: Path to model's complete response
""")

    return interface
def main():
    """Entry point: build the validator GUI and serve it locally."""
    app = ValidatorGUI().create_interface()
    # Local-only server; inbrowser pops a tab automatically.
    app.launch(
        share=False,
        server_name="127.0.0.1",
        server_port=7860,
        inbrowser=True,
    )


if __name__ == "__main__":
    main()