Spaces:

livecodebench
/

code_generation_samples

Running

File size: 4,027 Bytes

976ee8c
 
 
f2d5c0f
 
976ee8c
 
 
 
f2d5c0f
 
61f9bac
f2d5c0f
61f9bac
f2d5c0f
976ee8c
7983727
 
cb7f1d0
976ee8c
61f9bac
f2d5c0f
976ee8c
f2d5c0f
 
976ee8c
f2d5c0f
976ee8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2d5c0f
 
976ee8c
 
 
 
 
 
 
 
 
 
 
 
 
6fe41a3
f2d5c0f
976ee8c
 
f2d5c0f
976ee8c
 
c5259e1
 
 
 
 
 
976ee8c
 
 
 
 
 
 
 
 
 
 
 
 
 
24142d6
 
976ee8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487e952
976ee8c
 
 
 
 
61f9bac
f2d5c0f
74d76d7
0ba00fa
b3bc767
6fe41a3
0ba00fa
 
74d76d7
b3bc767
 
 
0ba00fa
b3bc767
6fe41a3
74d76d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487e952
74d76d7
 
 
 
 
 
 
f2d5c0f
da72949

import glob
import json
import os
import random

import numpy as np
from flask import Flask, abort, render_template, request

# Flask application object; the route decorators further down register on it.
app = Flask(__name__)


# Decode explicitly as UTF-8 (the interchange encoding for JSON) rather than
# relying on the platform's locale encoding, which would mis-decode or reject
# any non-ASCII text on systems whose default is not UTF-8.
with open("problems.json", encoding="utf-8") as f:
    problems = json.load(f)

# Question titles, in the order the problems appear in problems.json.
problem_choices = [q["question_title"] for q in problems]

# Display order of the problems, expressed as indices into the loaded list.
# This is currently the identity order; enabling the two commented-out lines
# below would shuffle it reproducibly.
random_idxs = list(range(len(problems)))
# random.seed(42)
# random.shuffle(random_idxs)
problems = [problems[idx] for idx in random_idxs]

with open("all_outputs.json", encoding="utf-8") as f:
    all_outputs = json.load(f)

# Model names: the top-level keys of all_outputs.json, in file order.
all_models = list(all_outputs.keys())

num_questions_filtered = len(problems)


def _mean_pass1_by_model(problem_index):
    """Return {model: mean of that model's pass1_list} for one problem index."""
    return {
        name: np.mean(all_outputs[name][problem_index]["pass1_list"])
        for name in all_models
    }


# Maps each problem index to the per-model mean of its pass1_list.
all_correctness_by_problem = {
    problem_index: _mean_pass1_by_model(problem_index)
    for problem_index in random_idxs
}


def calculate_color(performance):
    """Return the CSS ``rgba(...)`` string used to shade a correctness value.

    The value is bucketed by strict ``>`` comparisons against 0.75, 0.5 and
    0.25:

    * above 0.75: fixed green at alpha 0.5
    * above 0.5:  yellow-green, alpha equal to ``performance``
    * above 0.25: orange-red, alpha equal to ``1 - performance``
    * otherwise:  fixed red at alpha 0.5 (a NaN also lands here, because
      every comparison against it is false)
    """
    if performance > 0.75:
        return "rgba(0, 150, 0, 0.5)"
    if performance > 0.5:
        return f"rgba(50, 150, 0, {performance})"
    if performance > 0.25:
        # Lower scores get a more opaque warning colour.
        alpha = 1 - performance
        return f"rgba(150, 50, 0, {alpha})"
    return "rgba(150, 0, 0,  0.5)"


# One summary row per displayed problem. The rows are passed to the index
# templates, and the problem views read elements 1 and 3 of a row. Layout:
#   (display position,
#    {model: {"correctness": ..., "correctness_color": ...}},
#    difficulty,
#    question_id)
all_evaluations_by_problem_colored = [
    (
        trueidx,
        {
            model: {
                # Mean of pass1_list as a percentage with one decimal place.
                "correctness": f"{all_correctness_by_problem[idx][model]*100:.1f}",
                "correctness_color": calculate_color(
                    all_correctness_by_problem[idx][model]
                ),
            }
            for model in all_models
        },
        # `problems` has already been reordered by random_idxs, so it is
        # indexed by display position. Indexing it with idx would apply the
        # permutation twice whenever random_idxs is not the identity order.
        problems[trueidx]["difficulty"],
        problems[trueidx]["question_id"],
    )
    for trueidx, idx in enumerate(random_idxs)
]

def _samples_for_view(row):
    """Zip one output row's parallel lists into a list of per-sample dicts."""
    return [
        {"code": code, "pass1": passed, "metadata": meta}
        for code, passed, meta in zip(
            row["code_list"], row["pass1_list"], row["metadata_list"]
        )
    ]


# For every model: one list of sample dicts per problem, in display order.
all_data_for_view_formatted = {
    model: [_samples_for_view(resp[idx]) for idx in random_idxs]
    for model, resp in all_outputs.items()
}


@app.route("/")
def home():
    """Render ``index.html`` with every model and all per-problem summary rows."""
    print(all_models)
    context = {
        "models": all_models,
        "problems": all_evaluations_by_problem_colored,
    }
    return render_template("index.html", **context)


@app.route("/problem/<int:problem_idx>")
def problem(problem_idx):
    """Render ``problem.html`` for one problem, covering every model.

    Args:
        problem_idx: Display position of the problem, taken from the URL.

    Returns:
        The rendered template, which receives the question record, the
        per-model evaluation summary and each model's formatted samples.

    Raises:
        werkzeug.exceptions.NotFound: If ``problem_idx`` is outside the range
            of loaded problems (reported to the client as HTTP 404).
    """
    # Without this guard an index past the last problem raises IndexError in
    # the lookups below and reaches the client as a 500 instead of a 404.
    if not 0 <= problem_idx < len(problems):
        abort(404)

    summary_row = all_evaluations_by_problem_colored[problem_idx]
    samples_by_model = {
        model: all_data_for_view_formatted[model][problem_idx] for model in all_models
    }

    return render_template(
        "problem.html",
        problem_idx=problem_idx,
        question_id=summary_row[3],
        evaluation=summary_row[1],
        models=all_models,
        question=problems[problem_idx],
        data=samples_by_model,
    )


# Subset of model names used by the "/mini" and "/problem_mini/<idx>" views.
# problem_mini looks each name up in all_data_for_view_formatted, which is
# keyed by the top-level keys of all_outputs.json, so every active entry must
# match one of those keys exactly. Commented-out entries are excluded.
mini_models = [
    # "DeepSeek-V2",
    "DeepSeek-V3",
    "DeepSeek-R1-Preview",
    # "DSCoder-33b-Ins",
    # "GPT-4-Turbo-2024-04-09",
    "GPT-4O-2024-05-13",
    "Claude-3.5-Sonnet-20240620",
    "Gemini-Flash-2.0-Thinking",
    # "Gemini-Exp-1206",
    # "Claude-3-Sonnet",
    "O1-2024-12-17 (N=1) (High)",
    "QwQ-32B-Preview (N=1)",
]


@app.route("/mini")
def mini():
    """Render ``index_mini.html`` with the ``mini_models`` subset and all summary rows."""
    context = {
        "models": mini_models,
        "problems": all_evaluations_by_problem_colored,
    }
    return render_template("index_mini.html", **context)


@app.route("/problem_mini/<int:problem_idx>")
def problem_mini(problem_idx):
    """Render ``problem_mini.html`` for one problem, limited to ``mini_models``.

    Args:
        problem_idx: Display position of the problem, taken from the URL.

    Returns:
        The rendered template, which receives the question record, the
        evaluation summary and the formatted samples of each mini model.

    Raises:
        werkzeug.exceptions.NotFound: If ``problem_idx`` is outside the range
            of loaded problems (reported to the client as HTTP 404).
    """
    # Without this guard an index past the last problem raises IndexError in
    # the lookups below and reaches the client as a 500 instead of a 404.
    if not 0 <= problem_idx < len(problems):
        abort(404)

    summary_row = all_evaluations_by_problem_colored[problem_idx]
    # NOTE(review): a name in mini_models that is missing from all_outputs.json
    # raises KeyError here; confirm the list matches the data file.
    samples_by_model = {
        model: all_data_for_view_formatted[model][problem_idx] for model in mini_models
    }

    return render_template(
        "problem_mini.html",
        problem_idx=problem_idx,
        question_id=summary_row[3],
        # The summary still holds every model; the template is given
        # mini_models separately as the list to display.
        evaluation=summary_row[1],
        models=mini_models,
        question=problems[problem_idx],
        data=samples_by_model,
    )


if __name__ == "__main__":
    # Run directly as a script: start Flask's built-in server, listening on
    # all network interfaces (0.0.0.0) at port 7860.
    app.run(host="0.0.0.0", port=7860)