|
import os |
|
from typing import List, Optional |
|
from pydantic import BaseModel, Field |
|
import gradio as gr |
|
from datasets import load_dataset |
|
from huggingface_hub import InferenceClient |
|
import black |
|
|
|
|
|
# --- Inference client -------------------------------------------------------
# HF_API_URL is passed as the ``model`` argument; its default is a model id.
HF_TOKEN = os.environ.get("HF_TOKEN")
HF_API_URL = os.environ.get("HF_API_URL", "Qwen/Qwen2.5-Coder-32B-Instruct")
client = InferenceClient(model=HF_API_URL, token=HF_TOKEN)

# --- Quiz configuration -----------------------------------------------------
# A value of 0 is falsy and therefore disables the cap below.
EXAM_MAX_QUESTIONS = int(os.environ.get("EXAM_MAX_QUESTIONS", "1"))
EXAM_DATASET_ID = "agents-course/smolagents-quiz-data"

# --- Quiz data --------------------------------------------------------------
# The dataset is re-downloaded on every start instead of reusing a cached copy.
ds = load_dataset(EXAM_DATASET_ID, split="train", download_mode="force_redownload")
quiz_data = list(ds)[:EXAM_MAX_QUESTIONS] if EXAM_MAX_QUESTIONS else list(ds)

# Whether the dataset declares an "image" column at all.
HAS_IMAGE_FEATURE = "image" in ds.features
|
|
|
|
|
class CriterionFeedback(BaseModel):
    """Verdict on one assessment criterion.

    ``criterion``, ``met`` and ``explanation`` are required; ``improvement_tips``
    is optional and defaults to ``None``.
    """

    criterion: str = Field(
        default=..., description="The assessment criterion being evaluated"
    )
    met: bool = Field(default=..., description="Whether the criterion was met")
    explanation: str = Field(
        default=...,
        description="Detailed explanation of how well the criterion was met",
    )
    improvement_tips: Optional[str] = Field(
        default=None, description="Specific tips for improvement if needed"
    )
|
|
|
|
|
class CodeFeedback(BaseModel):
    """Complete review of a code submission.

    Holds an overall summary plus one ``CriterionFeedback`` entry per
    assessment criterion; both fields are required.
    """

    overall_feedback: str = Field(
        default=..., description="Overall assessment of the code solution"
    )
    criteria_feedback: List[CriterionFeedback] = Field(
        default=..., description="Detailed feedback for each assessment criterion"
    )
|
|
|
|
|
def format_python_code(code: str) -> str:
    """Return ``code`` reformatted by black, or unchanged if black fails.

    Any exception raised while formatting is reported to the user through a
    Gradio warning and the input is handed back as-is.
    """
    result = code
    try:
        result = black.format_str(code, mode=black.Mode())
    except Exception as exc:
        # Best effort: unformatted code is still usable downstream.
        gr.Warning(f"Code formatting failed: {exc}")
    return result
|
|
|
|
|
# Prompt sent to the LLM for grading. It is filled with ``str.format`` and
# expects exactly four placeholders: ``challenge``, ``solution``,
# ``student_code`` and ``criteria``.
EVALUATION_TEMPLATE = """Evaluate this Python code solution:

Challenge:
{challenge}

Reference Solution:
```python

{solution}

```

Student's Solution:

```python

{student_code}

```

Assessment Criteria:
{criteria}

Approach:
Be highly tolerant of differences in approach, as long as they meet Assessment Criteria.

Provide detailed feedback on how well each criterion was met."""
|
|
|
|
|
def check_code(
    user_code: str, solution: str, challenge: str, assessment_criteria: List[str]
) -> dict:
    """Grade a submission with the LLM and return the review as Markdown.

    Both snippets are passed through ``format_python_code`` before being
    placed in ``EVALUATION_TEMPLATE``. The model is asked for JSON matching
    the ``CodeFeedback`` schema, which is then rendered as Markdown sections.

    Args:
        user_code: The student's submitted code.
        solution: The reference solution.
        challenge: The challenge description.
        assessment_criteria: Criteria the submission is judged against.

    Returns:
        A dict with a single ``"feedback"`` key. On any error a Gradio warning
        is raised and the value is a generic failure message.
    """
    student_snippet = format_python_code(user_code)
    reference_snippet = format_python_code(solution)
    bullet_list = "\n".join(f"- {item}" for item in assessment_criteria)

    prompt = EVALUATION_TEMPLATE.format(
        challenge=challenge,
        solution=reference_snippet,
        student_code=student_snippet,
        criteria=bullet_list,
    )

    try:
        # NOTE(review): no max_new_tokens is passed here; confirm the
        # endpoint's default is large enough for the full JSON document.
        raw_json = client.text_generation(
            prompt=prompt,
            grammar={
                "type": "json_object",
                "value": CodeFeedback.model_json_schema(),
            },
        )
        review = CodeFeedback.model_validate_json(raw_json)

        sections = [f"### Overall Assessment\n{review.overall_feedback}\n\n"]
        for item in review.criteria_feedback:
            mark = "✅" if item.met else "❌"
            section = f"### {item.criterion}\n{mark} {item.explanation}"
            if item.improvement_tips:
                section += f"\n💡 Tip: {item.improvement_tips}"
            sections.append(section + "\n")

        return {"feedback": "\n".join(sections)}
    except Exception as exc:
        gr.Warning(f"Error generating feedback: {exc}")
        return {"feedback": "Unable to generate detailed feedback due to an error."}
|
|
|
|
|
def on_user_logged_in(token: gr.OAuthToken | None):
    """Compute the UI updates for the current login state.

    Returns a 7-tuple. The first two updates toggle with the login state
    (first visible only without a token, second visible only with one); the
    third is always hidden, and the remaining four reset two text values and
    hide two components after clearing their values.
    """
    logged_in = token is not None
    return (
        gr.update(visible=not logged_in),
        gr.update(visible=logged_in),
        gr.update(visible=False),
        "",
        gr.update(value="", visible=False),
        "",
        gr.update(value="", visible=False),
    )
|
|
|
|
|
def _build_results_markdown(user_answers):
    """Render the end-of-quiz Markdown report from the recorded answers."""
    header = """## Code Review Complete! 📚
This feedback should help you improve your skills.

⛔️ The feedback uses Qwen/Qwen2.5-Coder-32B-Instruct to compare your response to a gold
standard solution. As we know, LLMs are not perfect. You should compare your work against
the assessment criteria if you doubt the feedback.

Here's your detailed feedback:"""

    sections = []
    for idx, answer in enumerate(user_answers):
        criteria_bullets = "\n".join(f"- {c}" for c in answer["assessment_criteria"])
        sections.append(
            f"### Question {idx + 1}: {answer['challenge']}\n\n"
            "#### Your Solution:\n```python\n"
            f"{answer['submitted_code']}\n```\n\n"
            "#### Reference Solution:\n```python\n"
            f"{answer['correct_solution']}\n```\n\n"
            "#### Assessment Criteria:\n"
            f"{criteria_bullets}\n\n"
            "#### Feedback:\n"
            f"{answer['feedback']}\n\n"
            "---\n\n"
        )

    # A Markdown heading must start its own line; without this blank line the
    # first "### Question" heading is glued to the end of the intro sentence.
    return header + "\n\n" + "".join(sections)


def _build_question_markdown(question_idx, q):
    """Render the challenge text and criteria for the question at ``question_idx``."""
    criteria_bullets = "\n".join(f"- {c}" for c in q["assessment_criteria"])
    return (
        f"## Question {question_idx + 1}\n\n"
        f"### Challenge:\n{q['challenge']}\n\n"
        "### Assessment Criteria:\n"
        f"{criteria_bullets}"
    )


def handle_quiz(question_idx, user_answers, submitted_code, is_start):
    """Advance the quiz by one step and return the UI updates for it.

    Args:
        question_idx: Index of the question currently shown.
        user_answers: List of recorded answers; appended to in place.
        submitted_code: Code currently in the editor (may be empty or None).
        is_start: True when called from the Start button, which resets the
            index to 0 and skips grading.

    Returns:
        A 9-tuple: question text, code editor update, status text, new
        question index, answers list, start-button update, next-button
        update, final-results update, image update.

    When not starting, a non-blank submission is formatted, graded with
    ``check_code`` and recorded; a blank one is skipped. Either way the index
    then advances. Once the index passes the last question the final report
    is shown instead of a question.
    """
    if is_start:
        question_idx = 0
    else:
        # Guard against a None editor value so .strip() cannot raise.
        code = submitted_code or ""
        if question_idx < len(quiz_data) and code.strip():
            current_q = quiz_data[question_idx]
            formatted_code = format_python_code(code)
            feedback_dict = check_code(
                formatted_code,
                current_q["solution"],
                current_q["challenge"],
                current_q["assessment_criteria"],
            )
            user_answers.append(
                {
                    "challenge": current_q["challenge"],
                    "submitted_code": formatted_code,
                    "correct_solution": current_q["solution"],
                    "assessment_criteria": current_q["assessment_criteria"],
                    "feedback": feedback_dict["feedback"],
                }
            )
        question_idx += 1

    if question_idx >= len(quiz_data):
        # Quiz finished: hide the question widgets and reveal the report.
        results_text = _build_results_markdown(user_answers)
        return (
            "",
            gr.update(value="", visible=False),
            "Review your feedback below to improve your coding skills!",
            question_idx,
            user_answers,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(value=results_text, visible=True),
            gr.update(visible=False),
        )

    q = quiz_data[question_idx]
    challenge_text = _build_question_markdown(question_idx, q)

    # Only show the image widget when the dataset has an image column and
    # this particular question actually carries an image.
    show_image = HAS_IMAGE_FEATURE and q.get("image") is not None
    image_update = gr.update(
        value=q.get("image") if show_image else None, visible=show_image
    )

    return (
        challenge_text,
        gr.update(value=q["placeholder"], visible=True),
        "Submit your solution and click 'Next' to continue.",
        question_idx,
        user_answers,
        gr.update(visible=False),
        gr.update(visible=True),
        gr.update(visible=False),
        image_update,
    )
|
|
|
|
|
with gr.Blocks() as demo:
    demo.title = f"Coding Quiz: {EXAM_DATASET_ID}"

    # State carried between button clicks.
    question_idx = gr.State(value=0)
    user_answers = gr.State(value=[])

    # Intro banner.
    with gr.Row(variant="compact"):
        intro_text = gr.Markdown(
            """
## Welcome to the smolagents code reviewer

This application will review your smolagents code, and provide feedback on your solutions. This exercise is not reviewed or certified! It's about trying out smolagents for the first time.

ℹ️ Log in first, then click 'Start' to begin. Complete each coding challenge and click 'Next' to proceed. You'll get feedback on your solutions at the end."""
        )

    # Question on the left (text + optional image), code editor on the right.
    with gr.Row(variant="panel"):
        with gr.Column():
            question_text = gr.Markdown("")
            question_image = gr.Image(
                label="Question Image",
                visible=bool(HAS_IMAGE_FEATURE),
                type="pil",
            )
        with gr.Column():
            code_input = gr.Code(
                language="python", label="Your Solution", visible=False
            )

    with gr.Row(variant="compact"):
        status_text = gr.Markdown("")

    with gr.Row(variant="compact"):
        login_btn = gr.LoginButton()
        start_btn = gr.Button("Start")
        next_btn = gr.Button("Next ⏭️", visible=False)

    with gr.Row(variant="compact"):
        final_markdown = gr.Markdown("", visible=False)

    login_btn.click(
        fn=on_user_logged_in,
        inputs=None,
        outputs=[
            login_btn,
            start_btn,
            next_btn,
            question_text,
            code_input,
            status_text,
            question_image,
        ],
    )

    # Start and Next share one handler; only the trailing is_start flag differs.
    for trigger_btn, starting in ((start_btn, True), (next_btn, False)):
        trigger_btn.click(
            fn=handle_quiz,
            inputs=[question_idx, user_answers, code_input, gr.State(starting)],
            outputs=[
                question_text,
                code_input,
                status_text,
                question_idx,
                user_answers,
                start_btn,
                next_btn,
                final_markdown,
                question_image,
            ],
        )

if __name__ == "__main__":
    demo.launch()
|
|