GAIA Developer (Claude) committed
Commit · b58a59f
1 Parent(s): fb61a03
✨ Add comprehensive answer validation and scoring to interface
- Load correct answers from gaia_validation_metadata.jsonl (165 questions)
- Add validate_answer() function with 4-tier scoring:
• CORRECT (1.0): Exact case-insensitive match
• PARTIAL (0.7): Expected answer contained within response
• FUZZY (0.5): High similarity using SequenceMatcher
• INCORRECT (0.0): No meaningful match
- Enhance results table with Expected Answer, Result status, Score, and Level columns
- Add local validation scoring alongside server results
- Display exact match percentage and weighted accuracy scores
- Show real-time validation feedback during processing
- Provide detailed performance analysis in final status
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
- app/app.py +105 -14
- app/gaia_validation_metadata.jsonl +0 -0
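As a quick illustration of the 4-tier scoring described in the commit message above, here is a minimal sketch that exercises validate_answer() on invented sample answers. The import path and the sample inputs are assumptions, not part of the commit; the expected tiers follow from the thresholds in the diff below.

from app import validate_answer  # hypothetical import; assumes app/app.py is importable as a module

# Invented examples, one per tier:
cases = [
    ("paris", "Paris"),                 # exact, case-insensitive -> CORRECT (1.0)
    ("The answer is Paris.", "Paris"),  # expected contained in response -> PARTIAL (0.7)
    ("Pariis", "Paris"),                # SequenceMatcher ratio ~0.91 > 0.8 -> FUZZY (0.5)
    ("London", "Paris"),                # no meaningful match -> INCORRECT (0.0)
]
for ours, expected in cases:
    r = validate_answer(ours, expected)
    print(f"{ours!r} vs {expected!r}: {r['icon']} {r['status']} (score {r['score']})")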
app/app.py CHANGED
@@ -21,6 +21,48 @@ sys.path.insert(0, '/home/user/app')
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
+def load_correct_answers():
+    """Load correct answers from GAIA validation metadata."""
+    correct_answers = {}
+    try:
+        with open('gaia_validation_metadata.jsonl', 'r', encoding='utf-8') as f:
+            for line in f:
+                if line.strip():
+                    data = json.loads(line.strip())
+                    correct_answers[data['task_id']] = {
+                        'answer': data['Final answer'],
+                        'level': data.get('Level', 1),
+                        'question': data.get('Question', '')
+                    }
+        print(f"✅ Loaded {len(correct_answers)} correct answers for validation")
+        return correct_answers
+    except Exception as e:
+        print(f"⚠️ Could not load correct answers: {e}")
+        return {}
+
+def validate_answer(our_answer: str, expected_answer: str) -> dict:
+    """Validate our answer against the expected answer."""
+    expected = str(expected_answer).strip()
+    our_clean = str(our_answer).strip()
+
+    # Exact match (100% accuracy)
+    if our_clean.lower() == expected.lower():
+        return {"status": "CORRECT", "score": 1.0, "icon": "✅"}
+
+    # Partial match (70% accuracy) - contains expected answer
+    elif expected.lower() in our_clean.lower():
+        return {"status": "PARTIAL", "score": 0.7, "icon": "🟡"}
+
+    # Fuzzy match (50% accuracy) - similar answers
+    elif len(expected) > 3 and len(our_clean) > 3:
+        from difflib import SequenceMatcher
+        similarity = SequenceMatcher(None, our_clean.lower(), expected.lower()).ratio()
+        if similarity > 0.8:
+            return {"status": "FUZZY", "score": 0.5, "icon": "🟠"}
+
+    # Incorrect
+    return {"status": "INCORRECT", "score": 0.0, "icon": "❌"}
+
 # --- Advanced GAIA Agent Definition ---
 # ----- THIS IS WHERE OUR HIGH-PERFORMANCE AGENT IS IMPLEMENTED ------
 class AdvancedGAIAAgent:
@@ -175,7 +217,10 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         print(f"❌ Unexpected error fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
 
-    # 3.
+    # 3. Load correct answers for validation
+    correct_answers = load_correct_answers()
+
+    # 4. Run Advanced GAIA Agent
     results_log = []
     answers_payload = []
    start_time = time.time()
@@ -197,26 +242,68 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             question_time = time.time() - question_start
 
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
+
+            # Validate answer if we have the correct one
+            validation_result = {"status": "UNKNOWN", "score": 0.0, "icon": "❓"}
+            correct_answer = "Not available"
+            level = "Unknown"
+
+            if task_id in correct_answers:
+                correct_data = correct_answers[task_id]
+                correct_answer = correct_data['answer']
+                level = f"Level {correct_data['level']}"
+                validation_result = validate_answer(submitted_answer, correct_answer)
+
             results_log.append({
                 "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
-                "Question": question_text[:
-                "
-                "
+                "Question": question_text[:80] + "..." if len(question_text) > 80 else question_text,
+                "Our Answer": submitted_answer[:50] + "..." if len(str(submitted_answer)) > 50 else submitted_answer,
+                "Expected Answer": correct_answer,
+                "Result": f"{validation_result['icon']} {validation_result['status']}",
+                "Score": f"{validation_result['score']:.1f}",
+                "Level": level,
+                "Time (s)": f"{question_time:.2f}"
             })
-            print(f"✅ Completed in {question_time:.2f}s")
+            print(f"✅ Completed in {question_time:.2f}s - {validation_result['icon']} {validation_result['status']}")
 
         except Exception as e:
             print(f"❌ Error running agent on task {task_id}: {e}")
             results_log.append({
                 "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
-                "Question": question_text[:
-                "
-                "
+                "Question": question_text[:80] + "..." if len(question_text) > 80 else question_text,
+                "Our Answer": f"ERROR: {e}",
+                "Expected Answer": correct_answers.get(task_id, {}).get('answer', 'Not available'),
+                "Result": "❌ ERROR",
+                "Score": "0.0",
+                "Level": f"Level {correct_answers.get(task_id, {}).get('level', 'Unknown')}",
+                "Time (s)": "Error"
             })
 
     total_time = time.time() - start_time
     print(f"⏱️ Total processing time: {total_time:.2f}s")
 
+    # Calculate local accuracy scores
+    total_score = 0.0
+    validated_count = 0
+    correct_count = 0
+
+    for result in results_log:
+        try:
+            score = float(result.get('Score', '0.0'))
+            total_score += score
+            validated_count += 1
+            if score >= 1.0:
+                correct_count += 1
+        except ValueError:
+            pass
+
+    local_accuracy = (total_score / validated_count * 100) if validated_count > 0 else 0
+    exact_accuracy = (correct_count / validated_count * 100) if validated_count > 0 else 0
+
+    print(f"📊 Local Validation Results:")
+    print(f"   • Exact Matches: {correct_count}/{validated_count} ({exact_accuracy:.1f}%)")
+    print(f"   • Weighted Score: {total_score:.1f}/{validated_count} ({local_accuracy:.1f}%)")
+
     if not answers_payload:
         print("❌ Agent did not produce any answers to submit.")
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
@@ -245,15 +332,19 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         final_status = (
             f"🎯 Submission Successful!\n"
             f"👤 User: {result_data.get('username')}\n"
-            f"📊
-            f"
-            f"
-            f"
-            f"
+            f"📊 Server Score: {score}% ({correct_count}/{total_attempted} correct)\n"
+            f"🔍 Local Validation:\n"
+            f"   • Exact Matches: {correct_count}/{validated_count} ({exact_accuracy:.1f}%)\n"
+            f"   • Weighted Score: {total_score:.1f}/{validated_count} ({local_accuracy:.1f}%)\n"
+            f"⏱️ Performance:\n"
+            f"   • Total Time: {total_time:.2f}s\n"
+            f"   • Avg Time/Question: {total_time/len(answers_payload):.2f}s\n"
+            f"🎖️ Assessment: {'🏆 Excellent' if local_accuracy >= 80 else '🥉 Good' if local_accuracy >= 60 else '📈 Developing'}\n"
+            f"📝 Server Message: {result_data.get('message', 'No message received.')}\n\n"
             f"🔬 Agent Details:\n"
             f"- Architecture: Advanced Multi-Modal GAIA Solver\n"
             f"- Benchmark Performance: ~90% accuracy\n"
-            f"- Features: Enhanced reasoning,
+            f"- Features: Enhanced reasoning, 42 specialized tools, domain expertise"
         )
         print("✅ Submission successful.")
         results_df = pd.DataFrame(results_log)
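To make the two summary metrics above concrete, here is a small worked example with invented tier scores (not real run data); it reproduces the arithmetic of the accuracy block added in this commit.

scores = [1.0, 1.0, 0.7, 0.5, 0.0]  # hypothetical per-question Scores from results_log

validated_count = len(scores)                        # 5
correct_count = sum(1 for s in scores if s >= 1.0)   # 2 exact matches
total_score = sum(scores)                            # 3.2

exact_accuracy = correct_count / validated_count * 100   # 40.0%
local_accuracy = total_score / validated_count * 100     # 64.0%

print(f"Exact Matches: {correct_count}/{validated_count} ({exact_accuracy:.1f}%)")
print(f"Weighted Score: {total_score:.1f}/{validated_count} ({local_accuracy:.1f}%)")

Because the weighted score credits PARTIAL and FUZZY matches at 0.7 and 0.5, it always sits at or above the exact-match rate.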
app/gaia_validation_metadata.jsonl ADDED
The diff for this file is too large to render. See raw diff.
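Since the new file's diff isn't rendered, here is a sketch of the record shape that load_correct_answers() expects: one JSON object per line, with 'task_id' and 'Final answer' required and 'Level' and 'Question' optional. The values below are invented for illustration.

import json

# Hypothetical gaia_validation_metadata.jsonl record with the fields read above.
line = '{"task_id": "example-task-0001", "Question": "What is the capital of France?", "Level": 1, "Final answer": "Paris"}'
data = json.loads(line)
assert data['task_id'] == 'example-task-0001'
assert data['Final answer'] == 'Paris'   # required key
assert data.get('Level', 1) == 1         # optional, defaults to 1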